diff --git a/CODEOWNERS b/CODEOWNERS
index 007a304c3e706ce968576ec8979c08f1a3bcc552..b9f0313cc6d59d3fbdcd014e1a528126d863075a 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -45,7 +45,7 @@
 # /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
 # /tensorflow/contrib/slim/ @sguada @thenbasilmanran
 # /tensorflow/contrib/stateless/ @girving
-# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst
+# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
 # /tensorflow/contrib/testing/ @dandelionmane
 # /tensorflow/contrib/timeseries/ @allenlavoie
 # /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu
diff --git a/README.md b/README.md
index c66f7e3f3f49ed90e4e75475185585a932049f37..e1a50c87e26d493ba3ac760f357905d89aa40dab 100644
--- a/README.md
+++ b/README.md
@@ -7,14 +7,14 @@
 
 | **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|---------------|
-| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
 between them.  This flexible architecture enables you to deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
-code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
+code.  TensorFlow also includes [TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard), a data visualization toolkit.
 
 TensorFlow was originally developed by researchers and engineers
 working on the Google Brain team within Google's Machine Intelligence Research
diff --git a/RELEASE.md b/RELEASE.md
index e8459531748628fd822d876d79625fdd65798791..2717c75740aeea7821fb6c57dfc85908e86e9d51 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,61 @@
+# Release 1.8.0
+
+## Major Features And Improvements
+* Can now pass `tf.contrib.distribute.MirroredStrategy()` to `tf.estimator.RunConfig()` to run an Estimator model on multiple GPUs on one machine.
+* Add `tf.contrib.data.prefetch_to_device()`, which supports prefetching to GPU memory.
+* Added Gradient Boosted Trees as pre-made Estimators: BoostedTreesClassifier, BoostedTreesRegressor.
+* Add 3rd generation pipeline config for Cloud TPUs which improves performance and usability.
+* `tf.contrib.bayesflow` is moving out to it's own repo.
+* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication.
+
+## Bug Fixes and Other Changes
+* `tf.data`:
+  * Add `tf.contrib.data.prefetch_to_device`, which enables prefetching dataset elements to GPU memory.
+  * Add `tf.contrib.data.AUTOTUNE`, which allows the tf.data runtime to automatically tune the prefetch buffer sizes based on your system and environment.
+  * Add `tf.contrib.data.make_csv_dataset` for building datasets of CSV files.
+* Eager Execution:
+  * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators when eager execution is enabled.
+  * Automatic device placement has been enabled (i.e., use a GPU if available automatically, without requiring an explicit `with tf.device(“/gpu:0”)`) (Fixes #14133)
+  * `tf.GradientTape` has moved out of contrib.
+* `tf.keras`:
+  * Added the fashion mnist dataset.
+  * New data preprocessing functions: `image/random_brightness`, `sequence/TimeseriesGenerator`, and `text/hashing_trick`.
+* Accelerated Linear Algebra (XLA):
+  * Select and scatter in reference util and evaluator now use lexicographical order to break ties.
+* TensorFlow Debugger (tfdbg) CLI:
+  * During tensor-filter operations, allow exclusion of nodes by regular expressions.
+  * Fix spurious background colors in some text terminals.
+* `tf.contrib`:
+  * Add meta-distribution BatchReshape which reshapes batch dimensions.
+  * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU.
+  * Add `tf.contrib.framework.argsort`.
+  * Allow `DNNBoostedTreeCombinedEstimator` to work with core versions of feature columns and losses.
+  * Add non-linear image warping ops: `tf.contrib.image.sparse_image_warp`, `tf.contrib.image.dense_image_warp`, and `tf.contrib.image.interpolate_spline`.
+  * Fix bug in `tf.contrib.opt.MultitaskOptimizerWrapper` where types of tensors were mismatched.
+* Other:
+  * Low-level graph construction now calls the TensorFlow C API. This change should be invisible to most users, but can be disabled by setting the environment variable `TF_C_API_GRAPH_CONSTRUCTION=0` in this release. Future releases will remove the ability to disable this change. Please [file a bug](https://github.com/tensorflow/tensorflow/issues/new) if you find yourself using this escape hatch.
+  * Add description of shapes and a pointer to tutorial notebook in `tf.distributions.Distribution`.
+  * Update scatter operations:
+    * Add `tf.scatter_min` and `tf.scatter_max`
+    * Extend scatter operations to work with a scalar update parameter.
+  * Move cuDNN RNN ops to core for use in TensorFlow codebase only.
+  * Add `float64` support for `Conv2d`, `Conv2dBackpropInput`, and `Conv2dBackpropFilter`.
+  * Add `float64` support for `AvgPool`/`AvgPoolGrad`.
+  * Make graph name scope thread local so that they work correctly in multi-threaded environments.
+  * Update nsync synchronization library to avoid slow primitives on Linux.
+  * Removed need to put nsync/public on C include path when building custom ops.
+  * Add `tf.image.psnr`, `tf.image.ssim`, `tf.image.ssim_multiscale`, `tf.image.image_gradients`, `tf.image.sobel_edges`.
+  * Add links to https://js.tensorflow.org.
+  * Fix non-uniformity of orthogonal matrices.
+  * Fix bug where multi-image Estimator eval summaries were not displayed correctly.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Aghasy, Alan Du, Alan Lee, Alan Yee, Alex Wiltschko, Animesh Karnewar, Ankit Gupta, Anton Matosov, Aris L, Ben Barsdell, Brent Yi, Brett Koonce, Carl Thomé, cbockman, Chikanaga Tomoyuki, Chris Tava, CéDric Deltheil, Dahan Gong, Dalmo Cirne, Daniel Erenrich, David Norman, DavidNorman, Edd Wilder-James, Fanjin Zeng, Felix Abecassis, fo40225, George Sterpu, Giovanni Terlingen, Gor Baghdasaryan, Guillaume Klein, Hanchen Li, Ilya Polenov, Jakub Kolodziejczyk, Jason Sadler, Jayaram Bobba, Jerry Liu, jinghuangintel, Jiongyan Zhang (张炯衍), Joel Shor, Jong Wook Kim, Julian Eisenschlos, Karl Lessard, Krish Ravindranath, Loo Rong Jie, Lukas Geiger, Luke Iwanski, Mahmoud Abuzaina, ManHyuk, Marvin Richter, Maximilian Mitchell, Mohammad Ashraf Bhuiyan, msofka, Mustafa Kasap, Nathan Burnham, Nathan Luehr, Naveen Marri, ngc92, nio1814, Oleg Zabluda, Ou Changkun, Panos Ipeirotis, Paul Van Eck, Peter Lee, Piotr Czapla, qjivy, Rholais Lii, Rodrigo Formigone, Russell Klopfer, ryantimjohn, Sang Han, SebastiáN RamíRez, shengfuintel, Siby Jose Plathottam, Silver Chan, Stanislaw Antol, Taehoon Lee, Tarang Chugh, Ted Chang, Thomas Bastiani, Xian Xu, Xiaoming (Jason) Cui, Yan Facai (颜发才), yaox12, Yashal Shakti Kanungo, Yong Tang, Yuan (Terry) Tang, Yuxin Wu, Ziyue(Louis) Lu
+
+
 # Release 1.7.0
 
 ## Major Features And Improvements
diff --git a/WORKSPACE b/WORKSPACE
index 11c5cdb2070e79b16540a39f13cab28608962340..4ddfb9a3832ea1ea639ace887e1d601bdd857086 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
-    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
+    sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
+    strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",  # 2018-04-13
     ],
 )
 
diff --git a/configure.py b/configure.py
index 81d5ad77ee48b101c2f55baf5b3ee935dab756c8..8fb8979111627b9b25be80c77c611932880e011d 100644
--- a/configure.py
+++ b/configure.py
@@ -1516,7 +1516,8 @@ def main():
     set_tf_cudnn_version(environ_cp)
     if is_linux():
       set_tf_tensorrt_install_path(environ_cp)
-    set_tf_nccl_install_path(environ_cp)
+      set_tf_nccl_install_path(environ_cp)
+
     set_tf_cuda_compute_capabilities(environ_cp)
     if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
         'LD_LIBRARY_PATH') != '1':
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 823393ebdf1f4b658361f31963a275a683e61002..f2ad16fa04f5beb6616c58c28d0f0c460c3e3a17 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -401,25 +401,6 @@ package_group(
     ],
 )
 
-py_library(
-    name = "tensorflow_py",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/python"],
-)
-
-py_library(
-    name = "experimental_tensorflow_py",
-    srcs = ["experimental_api.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow/tools/api/tests:__subpackages__"],
-    deps = [
-        "//tensorflow/python",
-        "//tensorflow/tools/api/generator:python_api",
-    ],
-)
-
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
@@ -469,11 +450,12 @@ tf_cc_shared_object(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/core:framework_internal_impl",
+        "//tensorflow/core:gpu_runtime_impl",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
         "//tensorflow/core:lib_internal_impl",
-        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/stream_executor:stream_executor_impl",
-        "//tensorflow/core:gpu_runtime_impl",
     ] + tf_additional_binary_deps(),
 )
 
@@ -553,3 +535,14 @@ exports_files(
         "tf_exported_symbols.lds",
     ],
 )
+
+py_library(
+    name = "tensorflow_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python",
+        "//tensorflow/tools/api/generator:python_api",
+    ],
+)
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 78ad6aec19f3bbbfcb389012ac1577573b3e4901..c8683e3976c90add3f1f54d8e575c798327e9273 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -20,14 +20,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 # pylint: disable=wildcard-import
-from tensorflow.python import *  # pylint: disable=redefined-builtin
+from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
 
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
+
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 2367014cd02c721ea96581919c3efc96e772d9a6..8a9301d584775cff3ae315e6fd856b00d1734248 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -122,6 +122,7 @@ tf_cuda_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
     ],
 )
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index fe85f8ee0ed2c58c3ba9201a9ca895c9ec48c022..c8594347451dffd465d7fa926cc53818dc9e38d4 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -72,7 +72,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -80,7 +80,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index e82a5460920fc057303e326c8c749968141261f1..d3916bc16778a942b7eab4df93bbc19955b19e31 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
 using tensorflow::FunctionDef;
@@ -183,12 +184,19 @@ library {
   return std::move(functions[0]);
 }
 
+#if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
 //  node stack that reads a Imagenet TFRecordFile dataset from `file_path`, and
 //  sets `dataset_name` to the created dataset name. The returned functions must
 //  be deleted by calling TF_DeleteFunction.
 static std::vector<UniqueFuncPtr> CreateImagenetDatasetFunctions(
     const char* file_path, std::string* dataset_name, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return std::vector<UniqueFuncPtr>();
+#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -7067,8 +7075,11 @@ library {
         DCHECK(found);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
 }
+#endif
 
+#if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
 //  node stack that reads an MNIST file dataset from `file_path`, and
 //  sets `dataset_name` to the created dataset name. The returned functions must
@@ -7076,6 +7087,12 @@ library {
 static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
     const char* file_path, int batch_size, std::string* dataset_name,
     TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -8205,7 +8222,9 @@ library {
         DCHECK(found_batch_size);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
 }
+#endif
 
 // Adds the input functions to `graph`.  On success, returns the created
 // IteratorGetNext node.
@@ -8299,6 +8318,13 @@ TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph,
 TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
     TF_Graph* graph, const char* file_path, int batch_size,
     unsigned char is_mnist, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  // TODO(ashankar): get these functions working on Windows.
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
   tensorflow::Status s;
 
   std::string dataset_name;
@@ -8340,4 +8366,5 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
           << graph->graph.ToGraphDefDebug().DebugString();
 
   return getnext_node;
+#endif
 }
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 666342974ee0a51b707955cf7468e914fead85b3..88cb173cd25f4219e32392f6722a6ea7d358a553 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -35,7 +35,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -43,7 +43,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 3926c22ce1f9e194b1452c796c83944d10cfdc64..c06ce84a8c578aa60dd626c24bd58098b78ae750 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -30,7 +30,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -38,7 +38,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index fa03b1f3c2dfc334d4a3871e6a1bf5503fa8d5f8..19e6bf68e77725bb3cae4e1d338c52dff472cb18 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -60,6 +60,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc
index 5772776666129ed55a479c8917e69df3f3ce2fc0..5e74079fc158379b8977ada6412141e39142c3d3 100644
--- a/tensorflow/compiler/aot/runtime.cc
+++ b/tensorflow/compiler/aot/runtime.cc
@@ -31,7 +31,7 @@ namespace {
 inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN)
   return memalign(minimum_alignment, size);
-#elif defined(COMPILER_MSVC)
+#elif defined(_WIN32)
   return _aligned_malloc(size, minimum_alignment);
 #else  // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN
   void* ptr = nullptr;
@@ -48,7 +48,7 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) {
 }
 
 inline void aligned_free(void* aligned_memory) {
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
   _aligned_free(aligned_memory);
 #else
   free(aligned_memory);
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index b053dad1b57c258b7cb0d6831923e6a0f30f5e7e..bb73cb19c57a654058af5bbb4535c76b0aca8e8c 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -14,6 +14,7 @@ test_suite(
         ":test_graph_tfadd_test",
         ":test_graph_tfadd_with_ckpt_saver_test",
         ":test_graph_tfadd_with_ckpt_test",
+        ":test_graph_tfassert_eq_test",
         ":test_graph_tffunction_test",
         ":test_graph_tfgather_test",
         ":test_graph_tfmatmul_test",
@@ -33,6 +34,7 @@ py_binary(
         "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
@@ -52,6 +54,7 @@ genrule(
         "test_graph_tfadd_with_ckpt_saver.ckpt",
         "test_graph_tfadd_with_ckpt_saver.pb",
         "test_graph_tfadd_with_ckpt_saver.saver",
+        "test_graph_tfassert_eq.pb",
         "test_graph_tffunction.pb",
         "test_graph_tfgather.pb",
         "test_graph_tfmatmul.pb",
@@ -104,6 +107,17 @@ tf_library(
     ],
 )
 
+tf_library(
+    name = "test_graph_tfassert_eq",
+    testonly = 1,
+    config = "test_graph_tfassert_eq.config.pbtxt",
+    cpp_class = "AssertComp",
+    graph = "test_graph_tfassert_eq.pb",
+    tags = [
+        "manual",
+    ],
+)
+
 tf_library(
     name = "test_graph_tffunction",
     testonly = 1,
@@ -170,6 +184,7 @@ tf_cc_test(
         ":test_graph_tfadd",
         ":test_graph_tfadd_with_ckpt",
         ":test_graph_tfadd_with_ckpt_saver",
+        ":test_graph_tfassert_eq",
         ":test_graph_tffunction",
         ":test_graph_tfgather",
         ":test_graph_tfmatmul",
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 89c7cd4507cbd476104a039d6083d8f89de11278..67767f55dae9b15aafbd8b129328bde2c59a9ef3 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import app
@@ -125,6 +126,14 @@ def tfsplits(_):
   array_ops.identity(y, name='result')
 
 
+def tfassert_eq(_):
+  x = array_ops.placeholder(dtypes.int32, name='x_hold')
+  y = array_ops.placeholder(dtypes.int32, name='y_hold')
+  control_flow_ops.Assert(
+      math_ops.equal(x, y), ['Expected x == y.'], name='assert_eq')
+  math_ops.add(x, math_ops.negative(y), name='x_y_diff')
+
+
 def write_graph(build_graph, out_dir):
   """Build a graph using build_graph and write it out."""
   g = ops.Graph()
@@ -144,6 +153,7 @@ def main(_):
   write_graph(tfmatmulandadd, FLAGS.out_dir)
   write_graph(tffunction, FLAGS.out_dir)
   write_graph(tfsplits, FLAGS.out_dir)
+  write_graph(tfassert_eq, FLAGS.out_dir)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8732d1709e809bb47d3769c483483c2c4f350e1c
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tfassert_eq.config.pbtxt
@@ -0,0 +1,16 @@
+# Text form of tensorflow.tf2xla.Config proto.
+feed {
+  id { node_name: "x_hold" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "y_hold" }
+  shape {
+    dim { size: 1 }
+  }
+}
+fetch {
+  id { node_name: "x_y_diff" }
+}
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 413efd9cea3b6f71574615ad9ca92471ff925781..67dbd643bfc7bf2c214e7eb5ae8bd2cc7d6e164b 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_saver.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfassert_eq.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tffunction.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfgather.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h"
@@ -413,6 +414,23 @@ TEST(TFCompileTest, Splits) {
   EXPECT_NEAR(expected[3], fn.result0(1, 1), 1e4);
 }
 
+TEST(TFCompileTest, AssertEqAndReturnDiff) {
+  // Assert is converted into a no-op in XLA, so there is no failure even if the
+  // two args are different.
+  AssertComp assert;
+  EXPECT_EQ(assert.arg0_data(), assert.args()[0]);
+  EXPECT_EQ(assert.arg1_data(), assert.args()[1]);
+
+  assert.arg0() = 2;
+  assert.arg1() = 1;
+  const int32 expected_result = assert.arg0() - assert.arg1();
+  EXPECT_TRUE(assert.Run());
+  EXPECT_EQ(assert.error_msg(), "");
+  EXPECT_EQ(assert.result0(), expected_result);
+  EXPECT_EQ(assert.result0_data()[0], expected_result);
+  EXPECT_EQ(assert.result0_data(), assert.results()[0]);
+}
+
 TEST(TFCompileTest, LookupNameIndex) {
   // add doesn't have any names defined in its config.
   AddComp add;
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index a492fc6b9bb83dd64f1528bd4d8f13d640c5fbed..6edeb7047f9355ced9cbc04da75085191008b388 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -183,6 +183,13 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "shape_inference_helpers",
+    srcs = ["shape_inference_helpers.cc"],
+    hdrs = ["shape_inference_helpers.h"],
+    deps = ["//tensorflow/core:graph"],
+)
+
 # Internal targets below this point.
 
 cc_library(
@@ -293,6 +300,7 @@ cc_library(
     deps = [
         ":common",
         ":graph_to_functiondef",
+        ":shape_inference_helpers",
         ":union_find",
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/compiler/jit/kernels:parallel_check_op",
@@ -318,6 +326,25 @@ cc_library(
     hdrs = ["union_find.h"],
 )
 
+cc_library(
+    name = "producer_consumer_queue",
+    hdrs = ["producer_consumer_queue.h"],
+    deps = ["//tensorflow/core:lib"],
+)
+
+tf_cc_test(
+    name = "producer_consumer_queue_test",
+    size = "small",
+    srcs = ["producer_consumer_queue_test.cc"],
+    deps = [
+        ":producer_consumer_queue",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "graph_to_functiondef_test",
     size = "small",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index b04b333141a616e7c4db2751c14ec6eb0b7725b5..9465385b5856baf4d03f280ff30572e196a7663b 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
+#include "tensorflow/compiler/jit/shape_inference_helpers.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -36,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
@@ -576,7 +578,8 @@ class Encapsulator {
   // satisfied, e.g., because send_node depends on a node that doesn't have a
   // registered shape inference function.
   Status DoStaticShapeInferenceForOutsideCompilationSend(
-      const Graph& graph_in, const ShapeRefiner& shape_refiner,
+      const Graph& graph_in, const BackEdgeHelper& back_edge_helper,
+      const ShapeRefiner& shape_refiner,
       const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
       FunctionLibraryDefinition* library,
       std::vector<TensorShapeProto>* static_shape_out,
@@ -599,7 +602,7 @@ class Encapsulator {
   // to nodes in pruned_graph.
   Status MakeGraphForOutsideCompilationSends(
       const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
-      ShapeRefiner* shape_refiner,
+      BackEdgeHelper* back_edge_helper, ShapeRefiner* shape_refiner,
       std::unordered_map<const Node*, Node*>* node_images,
       FunctionLibraryDefinition* library);
 
@@ -1712,9 +1715,13 @@ namespace {
 // matter because it will only be used subsequently for shape inference. (It
 // would be possible to add a switch statement over data_type to create a value
 // for the constant, but that would entail maintaining the logic as new types
-// are added, and is not necessary.)
-Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
-                         Graph* graph_out) {
+// are added, and is not necessary.) If the node being replaced was within a
+// control flow frame, adds appropriate Enter nodes so that the use of the Const
+// is well-formed.
+Node* AddDummyShapedNode(const Node* src_node, int src_port,
+                         const std::vector<ControlFlowInfo>& control_flow_info,
+                         const TensorShapeProto& shape, Graph* graph_out) {
+  DataType data_type = src_node->output_type(src_port);
   TensorProto dummy_proto;
   dummy_proto.set_dtype(data_type);
   *dummy_proto.mutable_tensor_shape() = shape;
@@ -1725,7 +1732,23 @@ Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
   NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const",
                            options.op_registry());
   node_builder.Attr("dtype", data_type).Attr("value", dummy_proto);
-  return options.FinalizeBuilder(&node_builder);
+  Node* node = options.FinalizeBuilder(&node_builder);
+  // Add any Enter nodes required to bring the constant to the correct control
+  // flow frame.
+  while (!control_flow_info[src_node->id()].frame_name.empty()) {
+    NodeBuilder enter_builder(options.GetNameForOp("Enter"), "Enter",
+                              options.op_registry());
+    enter_builder.Attr("frame_name",
+                       control_flow_info[src_node->id()].frame_name);
+    enter_builder.Attr("is_constant", true);
+    enter_builder.Input(node, 0);
+    Node* enter_node = options.FinalizeBuilder(&enter_builder);
+    // Adopt the new Enter node as the value in the current frame.
+    node = enter_node;
+    // Recurse to the parent frame to see if more Enter nodes need to be added.
+    src_node = control_flow_info[src_node->id()].parent_frame;
+  }
+  return node;
 }
 
 // Adds a copy of node_in to graph_out and adds the mapping to
@@ -1767,17 +1790,30 @@ Status CopyShapeInferenceNodeToGraph(
       }
     }
   }
+  // Work around the fact that Enter nodes refuse to propagate shape information
+  // unless they are marked loop invariant. Since we are never going to execute
+  // this graph, marking them all loop invariant is fine.
+  if (node_out->type_string() == "Enter") {
+    node_out->ClearAttr("is_constant");
+    node_out->AddAttr("is_constant", true);
+  }
   return Status::OK();
 }
 
 }  // namespace
 
 Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
-    const Graph& graph_in, const ShapeRefiner& shape_refiner,
+    const Graph& graph_in, const BackEdgeHelper& back_edge_helper,
+    const ShapeRefiner& shape_refiner,
     const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
     FunctionLibraryDefinition* library,
     std::vector<TensorShapeProto>* static_shape_out,
     std::unique_ptr<Graph>* graph_out) {
+  // Get the control flow structure of the input graph so we can build
+  // well-formed output graphs.
+  std::vector<ControlFlowInfo> control_flow_info;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(&graph_in, &control_flow_info));
+
   // Maps from nodes in graph_in to nodes in graph_out.
   //
   // When an edge has fully defined shape the source node in graph_in is
@@ -1802,7 +1838,6 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
 
   // We don't use the standard ReverseDFS because we want to cut off traversal
   // whenever we find an output with fully defined shape.
-  // TODO(misard) make this work properly in the presence of control flow.
   struct Work {
     Node* node;
     bool leave;  // Are we entering or leaving node?
@@ -1840,8 +1875,9 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
             TensorShapeProto proto;
             context->ShapeHandleToProto(shape, &proto);
             if (dummy_node_images.find(src_node) == dummy_node_images.end()) {
-              dummy_node_images[src_node] = AddDummyShapedNode(
-                  src_node->output_type(src_port), proto, graph_out->get());
+              dummy_node_images[src_node] =
+                  AddDummyShapedNode(src_node, src_port, control_flow_info,
+                                     proto, graph_out->get());
             }
             // The final input to the send node is the dynamic key, which we
             // don't include in the static shapes.
@@ -1889,6 +1925,38 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
     }
   }
 
+  for (const auto edge : back_edge_helper.RemovedEdges()) {
+    if (copied_node_images.find(edge.dst) != copied_node_images.end()) {
+      // The destination of this back edge was added to the inference graph, so
+      // fix it up.
+      Node* dst = copied_node_images[edge.dst];
+      if (dst->type_string() != "Merge") {
+        return errors::InvalidArgument(
+            "outside_compilation cluster contains a back-edge to node ",
+            dst->name(), " of type ", dst->type_string(),
+            ". The analysis pass only supports back-edges to Merge nodes.");
+      }
+      const Edge* existing_input_edge;
+      if (edge.dst_input != 1 || dst->num_inputs() != 2 ||
+          !dst->input_edge(0, &existing_input_edge).ok()) {
+        // TODO(misard) if we see graphs built with a different structure, relax
+        // this constraint. Leaving it here for now to avoid writing unnecessary
+        // complex code since we believe graphs generated by front ends all have
+        // the back edge as the second input to the merge node.
+        return errors::Internal(
+            "Internal assumption failed while rewriting an outside_compilation "
+            "cluster that contains a while loop. Logic assumes back-edge is to "
+            "port 1 of a 2-input "
+            "Merge node.");
+      }
+      // Connect the existing edge to both inputs of the Merge node so that the
+      // graph will be well-formed.
+      (*graph_out)
+          ->AddEdge(existing_input_edge->src(),
+                    existing_input_edge->src_output(), dst, edge.dst_input);
+    }
+  }
+
   return Status::OK();
 }
 
@@ -1956,7 +2024,7 @@ Status Encapsulator::MakePrunedGraphCopyAndInline(
 
 Status Encapsulator::MakeGraphForOutsideCompilationSends(
     const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
-    ShapeRefiner* shape_refiner,
+    BackEdgeHelper* back_edge_helper, ShapeRefiner* shape_refiner,
     std::unordered_map<const Node*, Node*>* node_images,
     FunctionLibraryDefinition* library) {
   // Find all the send_from_host nodes in all subgraphs, to use as roots for the
@@ -1978,10 +2046,15 @@ Status Encapsulator::MakeGraphForOutsideCompilationSends(
   // nodes, inlining any functions as needed.
   TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline(
       graph, send_from_host_nodes, pruned_graph, node_images, library));
+  FixupSourceAndSinkEdges(pruned_graph->get());
+
+  // Remove back edges from any cycles in the pruned graph to simplify shape
+  // inference traversal. They will be fixed up in the per-subgraph shape
+  // inference graphs stored in the function library.
+  TF_RETURN_IF_ERROR(back_edge_helper->Remove(pruned_graph->get()));
 
   // Perform shape inference on the pruned graph.
   shape_refiner->set_require_shape_inference_fns(false);
-  FixupSourceAndSinkEdges(pruned_graph->get());
   std::vector<Node*> post_order;
   GetReversePostOrder(*(*pruned_graph), &post_order);
   for (auto node : post_order) {
@@ -1999,11 +2072,13 @@ Status Encapsulator::MakeGraphForOutsideCompilationSends(
 
 Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
     Graph* graph_out, FunctionLibraryDefinition* library) {
+  BackEdgeHelper back_edge_helper;
   std::unique_ptr<Graph> pruned_graph;
   ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry());
   std::unordered_map<const Node*, Node*> node_images;
   TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends(
-      *graph_out, &pruned_graph, &shape_refiner, &node_images, library));
+      *graph_out, &pruned_graph, &back_edge_helper, &shape_refiner,
+      &node_images, library));
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile("pruned_graph_for_shape_inference",
@@ -2033,7 +2108,7 @@ Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
       std::unique_ptr<Graph> graph;
       if (send_node != nullptr) {
         TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
-            *pruned_graph, shape_refiner, recv_at_host_names,
+            *pruned_graph, back_edge_helper, shape_refiner, recv_at_host_names,
             node_images[send_node], library, &static_shape, &graph));
         if (graph == nullptr) {
           VLOG(2) << "Send node  " << send_node->name() << " shapes";
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 386240ff8d1a562ff4894c40ac79022b8c61fb15..8e2ee0f1d71bc17b4c12c792c38002af4f9eb5eb 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -51,6 +51,15 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   // is really a kind of function call and will be handled by
   // IsCompilableCall().
   if (node.type_string() == "SymbolicGradient") return false;
+  if (node.type_string() == "Const") {
+    // Skip Const op with type DT_STRING, since XLA doesn't support it, but the
+    // registered Const KernelDef says that it does, to support no-op Assert for
+    // tfcompile.
+    const AttrValue* attr = node.attrs().Find("dtype");
+    if (attr != nullptr && attr->type() == DT_STRING) {
+      return false;
+    }
+  }
   return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok();
 }
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 80edaf28b83348e3a8071a6e9696bc3ebad5d70f..703d8825d74ced8d4d69c31ccd730adc89a8bffe 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -609,5 +609,29 @@ TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) {
   EXPECT_TRUE(clusters.empty());
 }
 
+TEST(XlaCompilationTest, ConstOp) {
+  // valid data type
+  {
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    Scope root = Scope::NewRootScope().ExitOnError();
+    auto c = ops::Const(root.WithOpName("const"), 0.5f);
+    c.node()->AddAttr(kXlaCompileAttr, true);
+    TF_ASSERT_OK(root.ToGraph(graph.get()));
+    TF_ASSERT_OK(MarkForCompilation(&graph));
+    EXPECT_EQ(1, GetClusters(*graph).size());
+  }
+
+  // invalid data type
+  {
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    Scope root = Scope::NewRootScope().ExitOnError();
+    auto c = ops::Const(root.WithOpName("const"), string("string"));
+    c.node()->AddAttr(kXlaCompileAttr, true);
+    TF_ASSERT_OK(root.ToGraph(graph.get()));
+    TF_ASSERT_OK(MarkForCompilation(&graph));
+    EXPECT_TRUE(GetClusters(*graph).empty());
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/producer_consumer_queue.h b/tensorflow/compiler/jit/producer_consumer_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c8c04152d2f3a0fd46711df24756b7e68b967ea
--- /dev/null
+++ b/tensorflow/compiler/jit/producer_consumer_queue.h
@@ -0,0 +1,132 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
+#define TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
+
+#include <deque>
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// A thread-safe, first-in-first-out queue.
+template <typename T>
+class ProducerConsumerQueue {
+ public:
+  ProducerConsumerQueue()
+      : capacity_(std::numeric_limits<std::size_t>::max()) {}
+  ~ProducerConsumerQueue() = default;
+
+  // Wait until the queue is non-full, then append a copy of v.
+  void Put(const T &v);
+
+  // Wait until the queue is non-empty, then remove and return the head value.
+  T Get();
+
+  // If the queue is non-empty, remove the head value, placing it in *pv, and
+  // return true; otherwise return false.
+  bool TryGet(T *pv);
+
+  // Set the capacity of the queue; the queue is full whenever count() >=
+  // capacity().  The initial value is the maximum size_t.  Requires size > 0.
+  void set_capacity(std::size_t size);
+
+  // Return the capacity of the queue.
+  std::size_t capacity() const;
+
+  // Return the number of elements in the queue.
+  std::size_t count() const;
+
+  // Implementation details follow.  Clients should ignore.
+ private:
+  mutable tensorflow::mutex mu_;  // protects all fields below
+  tensorflow::condition_variable non_empty_ GUARDED_BY(mu_);
+  tensorflow::condition_variable non_full_ GUARDED_BY(mu_);
+  std::size_t capacity_ GUARDED_BY(mu_);
+  std::deque<T> queue_ GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ProducerConsumerQueue);
+};
+
+// ------------------------------------------------------
+// Implementation details follow.  Clients should ignore.
+
+// Wait until the queue is non-full, then append a copy of v.
+template <typename T>
+void ProducerConsumerQueue<T>::Put(const T &v) {
+  mutex_lock lock(mu_);
+  while (queue_.size() >= capacity_) {
+    non_full_.wait(lock);
+  }
+  queue_.push_back(v);
+  non_empty_.notify_one();
+}
+
+// Wait until the queue is non-empty, then remove and return the head value.
+template <typename T>
+T ProducerConsumerQueue<T>::Get() {
+  mutex_lock lock(mu_);
+  while (queue_.empty()) {
+    non_empty_.wait(lock);
+  }
+  non_full_.notify_one();
+  T result_value = queue_.front();
+  queue_.pop_front();
+  return result_value;
+}
+
+// If the queue is non-empty, remove the head value, placing it in *pv, and
+// return true; otherwise return false.
+template <typename T>
+bool ProducerConsumerQueue<T>::TryGet(T *pv) {
+  mutex_lock lock(mu_);
+  bool got_element = !queue_.empty();
+  if (got_element) {
+    non_full_.notify_one();
+    *pv = queue_.front();
+    queue_.pop_front();
+  }
+  return got_element;
+}
+
+// Set the capacity of the queue; the queue is full whenever count() >=
+// capacity().  The initial value is the maximum size_t.  Requires size > 0.
+template <typename T>
+void ProducerConsumerQueue<T>::set_capacity(std::size_t size) {
+  mutex_lock lock(mu_);
+  CHECK_NE(size, 0);
+  capacity_ = size;
+  non_full_.notify_all();
+}
+
+// Return the capacity of the queue.
+template <typename T>
+std::size_t ProducerConsumerQueue<T>::capacity() const {
+  mutex_lock lock(mu_);
+  std::size_t max_elements = capacity_;
+  return max_elements;
+}
+
+// Return the number of elements in the queue.
+template <typename T>
+std::size_t ProducerConsumerQueue<T>::count() const {
+  mutex_lock lock(mu_);
+  std::size_t num_elements = queue_.size();
+  return num_elements;
+}
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
diff --git a/tensorflow/compiler/jit/producer_consumer_queue_test.cc b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f61260c6e52756ee039829afdc7452f5f760c221
--- /dev/null
+++ b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/producer_consumer_queue.h"
+
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+typedef ProducerConsumerQueue<int> IntQueue;
+
+// Insert integers between low inclusive and high exclusive into q.
+void PushRange(IntQueue *q, int low, int high) {
+  while (low != high) {
+    q->Put(low);
+    VLOG(2) << "Pushing " << low;
+    ++low;
+  }
+}
+
+// Push the numbers between 0 and 999 inclusive from several threads in the
+// pool.
+void PushRanges(IntQueue *queue, thread::ThreadPool *pool) {
+  VLOG(1) << "Adding 20-36";
+  pool->Schedule([queue] { PushRange(queue, 20, 36); });
+  VLOG(1) << "Adding 7-20";
+  pool->Schedule([queue] { PushRange(queue, 7, 20); });
+  VLOG(1) << "Adding 36-501";
+  pool->Schedule([queue] { PushRange(queue, 36, 501); });
+  VLOG(1) << "Adding 501-1000";
+  pool->Schedule([queue] { PushRange(queue, 501, 1000); });
+  VLOG(1) << "Adding 0-5";
+  pool->Schedule([queue] { PushRange(queue, 0, 5); });
+  VLOG(1) << "Adding 5-7";
+  pool->Schedule([queue] { PushRange(queue, 5, 7); });
+}
+
+// Pop elements from queue using Get().  Make sure that exactly <high> elements
+// were present and their values are all integers between 0 and high-1
+// inclusive.
+void GetRange(IntQueue *queue, int high) {
+  VLOG(1) << "Testing Wait";
+  std::vector<int> results;
+  for (int i = 0; i != high; ++i) {
+    int r = queue->Get();
+    VLOG(2) << "Waited and got " << r;
+    results.push_back(r);
+  }
+  CHECK_EQ(queue->count(), 0);
+  std::sort(results.begin(), results.end());
+  for (int i = 0; i != high; ++i) {
+    CHECK(results[i] == i);
+  }
+}
+
+// Pop elements from queue using TryGet().  Make sure that exactly <high>
+// elements were present and their values are all integers between 0 and high-1
+// inclusive.
+void TryGetRange(IntQueue *queue, int high) {
+  std::vector<int> results;
+  // Give up if we don't get all the elements back from the queue
+  // in 10 seconds.
+  int timeout = 10;
+  int r;
+  for (int i = 0; i != high; ++i) {
+    while (!queue->TryGet(&r)) {
+      if (!timeout--) {
+        LOG(FATAL) << "Can't find all elements in the queue";
+      }
+      VLOG(1) << "Sleeping for a second...";
+      sleep(1);
+    }
+    VLOG(2) << "Popped " << r;
+    results.push_back(r);
+  }
+  CHECK_EQ(queue->count(), 0);
+  CHECK(!queue->TryGet(&r));
+  std::sort(results.begin(), results.end());
+  for (int i = 0; i != high; ++i) {
+    CHECK_EQ(i, results[i]);
+  }
+}
+
+const int kNumThreads = 15;
+
+TEST(ProducerConsumerQueue, GetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    PushRanges(&queue, &pool);
+  }
+  GetRange(&queue, 1000);
+}
+
+TEST(ProducerConsumerQueue, TryGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    PushRanges(&queue, &pool);
+  }
+  TryGetRange(&queue, 1000);
+}
+
+TEST(ProducerConsumerQueue, ParallelGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    pool.Schedule([&queue] { GetRange(&queue, 1000); });
+    PushRanges(&queue, &pool);
+  }
+}
+
+TEST(ProducerConsumerQueue, ParallelTryGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    pool.Schedule([&queue] { TryGetRange(&queue, 1000); });
+    PushRanges(&queue, &pool);
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/shape_inference_helpers.cc b/tensorflow/compiler/jit/shape_inference_helpers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9cfa16526bc5d809942a35e86075b4ec6e88a59
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference_helpers.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Contains helpers for use in shape inference.
+
+#include "tensorflow/compiler/jit/shape_inference_helpers.h"
+
+#include <vector>
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+Status BackEdgeHelper::Remove(Graph* graph) {
+  if (graph_ != nullptr) {
+    return errors::Internal("BackEdgeHelper duplicate call to Remove.");
+  }
+  graph_ = graph;
+  for (Node* n : graph_->nodes()) {
+    if (n->IsMerge()) {
+      for (const Edge* e : n->in_edges()) {
+        if (e->src()->IsNextIteration()) {
+          back_edges_.push_back(
+              BackEdge{e, e->src(), e->src_output(), e->dst(), e->dst_input()});
+        }
+      }
+    }
+  }
+  for (const BackEdge& be : back_edges_) {
+    graph_->RemoveEdge(be.edge);
+  }
+  return Status::OK();
+}
+
+const std::vector<BackEdgeHelper::BackEdge>& BackEdgeHelper::RemovedEdges()
+    const {
+  return back_edges_;
+}
+
+Status BackEdgeHelper::Replace() {
+  if (graph_ == nullptr) {
+    return errors::Internal("BackEdgeHelper Replace called before Remove.");
+  }
+  if (replaced_) {
+    return errors::Internal("BackEdgeHelper Replace called more than once.");
+  }
+  replaced_ = true;
+  for (const BackEdge& be : back_edges_) {
+    graph_->AddEdge(be.src, be.src_output, be.dst, be.dst_input);
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/shape_inference_helpers.h b/tensorflow/compiler/jit/shape_inference_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f053c9a45dd47ca1b056634d2248d6181e77d68
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference_helpers.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
+#define TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
+
+#include <vector>
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Helper class to temporarily remove, then replace, the back edges in a
+// graph. Simple algorithms for shape inference don't work with cycles, and this
+// class can be used to remove cycles before running inference and replace them
+// after. Correct usage requires exactly one call to Remove(), followed by any
+// number of calls to RemovedEdges() and at most one call to Replace(). The call
+// to Replace() is optional if the graph will be discarded without being
+// executed, e.g., if it is being used purely for a shape inference pass.
+class BackEdgeHelper {
+ public:
+  struct BackEdge {
+    const Edge* edge;
+    Node* src;
+    int src_output;
+    Node* dst;
+    int dst_input;
+  };
+
+  BackEdgeHelper() = default;
+  // Disallows copy and assign.
+  BackEdgeHelper(const BackEdgeHelper& other) = delete;
+  BackEdgeHelper& operator=(const BackEdgeHelper& other) = delete;
+
+  // Temporarily removes all the back edges in graph.
+  Status Remove(Graph* graph);
+
+  // Gets the list of removed edges.
+  const std::vector<BackEdge>& RemovedEdges() const;
+
+  // Replaces the back edges removed by a prior call to Remove.
+  Status Replace();
+
+ private:
+  Graph* graph_ = nullptr;  // not owned
+  std::vector<BackEdge> back_edges_;
+  // Set once Replace has been called.
+  bool replaced_ = false;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_HELPERS_H_
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index e345c1266a7a9399811f735ff1886e043879bb47..b9e42ca677cd82e2c18309d25ab33954206ebbe4 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -124,6 +124,7 @@ tf_xla_py_test(
     name = "categorical_op_test",
     size = "small",
     srcs = ["categorical_op_test.py"],
+    tags = ["optonly"],
     deps = [
         ":xla_test",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -199,6 +200,11 @@ tf_xla_py_test(
         "cpu",
         "cpu_ondemand",
     ],
+    tags = [
+        # Allocates very large amounts of memory and does not work under TSAN.
+        "notsan",
+        "optonly",  # Times out frequently in fastbuild.
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -265,6 +271,18 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "dynamic_slice_ops_test",
+    size = "small",
+    srcs = ["dynamic_slice_ops_test.py"],
+    deps = [
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
 tf_xla_py_test(
     name = "dynamic_stitch_test",
     size = "small",
@@ -491,6 +509,22 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "reduce_window_test",
+    size = "small",
+    srcs = ["reduce_window_test.py"],
+    disabled_backends = ["cpu_ondemand"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "reverse_ops_test",
     size = "medium",
@@ -683,6 +717,21 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "while_test",
+    size = "small",
+    srcs = ["while_test.py"],
+    disabled_backends = ["cpu_ondemand"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "gather_test",
     size = "medium",
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index d1d7379c0a32eff4ff96e791dacbe800bbd70b7d..1e4dd32916c3a40282735fb8f75670b0e9ef0dc9 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -360,11 +360,13 @@ class BinaryOpsTest(XLATestCase):
           np.array([2, -1], dtype=dtype),
           expected=np.array([[[[3, 1], [5, 3]]]], dtype=dtype))
 
-    self._testBinary(
-        math_ops.add,
-        np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
-        np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
-        expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
+    if np.int64 in self.numeric_types:
+      self._testBinary(
+          math_ops.add,
+          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
+          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
+          expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36],
+                            dtype=np.int64))
 
   def testComplexOps(self):
     for dtype in self.complex_types:
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index a9db1c173d33b0bc44248a4b55c678f7083b5527..7b114d4f85d3a5cadc6af25b55c5a21f90d2a768 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -51,12 +51,12 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
     if backend == "cpu":
       backend_args += [
           "--test_device=XLA_CPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
+          "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
       ]
     elif backend == "gpu":
       backend_args += [
           "--test_device=XLA_GPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_BFLOAT16"
+          "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_BFLOAT16"
       ]
       backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
diff --git a/tensorflow/compiler/tests/dynamic_slice_ops_test.py b/tensorflow/compiler/tests/dynamic_slice_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a46d2ec3e7aee3a4ecfbf1ab9f622d8eb659e3c
--- /dev/null
+++ b/tensorflow/compiler/tests/dynamic_slice_ops_test.py
@@ -0,0 +1,93 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for XLA dynamic slicing ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class DynamicUpdateSliceOpsTest(XLATestCase):
+
+  def _assertOpOutputMatchesExpected(self, op, args, expected):
+    with self.test_session() as session:
+      with self.test_scope():
+        placeholders = [
+            array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape)
+            for arg in args
+        ]
+        feeds = {placeholders[i]: args[i] for i in range(0, len(args))}
+        output = op(*placeholders)
+      result = session.run(output, feeds)
+      self.assertAllClose(result, expected, rtol=1e-3)
+
+  def testUpdateSlice(self):
+    for dtype in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array([], dtype=dtype),
+              np.array([], dtype=dtype),
+              np.array([0], dtype=np.int32)
+          ],
+          expected=np.array([], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
+              np.array([-1, -2, -3], dtype=dtype),
+              np.array([6], dtype=np.int32)
+          ],
+          expected=np.array([1, 2, 3, 4, 5, 6, -1, -2, -3, 10], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.array([[42, 43], [44, 45]], dtype=dtype),
+              np.array([1, 2], dtype=np.int32)
+          ],
+          expected=np.array(
+              [[1, 2, 3, 4], [5, 6, 42, 43], [9, 10, 44, 45]], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.array([[], []], dtype=dtype),
+              np.array([1, 2], dtype=np.int32)
+          ],
+          expected=np.array(
+              [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          xla.dynamic_update_slice, [
+              np.array(
+                  [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=dtype),
+              np.ones([3, 4], dtype=dtype),
+              np.array([0, 0], dtype=np.int32)
+          ],
+          expected=np.ones([3, 4], dtype=dtype))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index f9db4cf2017c0b4b6dc0cfeeda6dca7bb9d14f19..8e6407dffdac3adbcda8cbca2109ef9196defa8c 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -134,9 +134,15 @@ class FtrlOptimizerTest(XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-2.60260963, -4.29698515]), var0.eval(), float_rtol=1e-5)
+            np.array([-2.60260963, -4.29698515]),
+            var0.eval(),
+            float_rtol=1e-5,
+            half_rtol=1e-2)
         self.assertAllCloseAccordingToType(
-            np.array([-0.28432083, -0.56694895]), var1.eval(), float_rtol=1e-5)
+            np.array([-0.28432083, -0.56694895]),
+            var1.eval(),
+            float_rtol=1e-5,
+            half_rtol=1e-2)
 
   def testFtrlwithoutRegularization2(self):
     for dtype in self.float_types:
@@ -272,8 +278,8 @@ class FtrlOptimizerTest(XLATestCase):
       with self.test_session(), self.test_scope():
         val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype)
 
-    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4)
-    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4, half_rtol=1e-2)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4, half_rtol=1e-2)
 
   def testEquivGradientDescentwithoutRegularization(self):
     steps = 5
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index 11d8a99ffe1a136a54b16e20f1792062203f7969..fbc3c994d163a504351fcccd1ba71a0997e6516f 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -105,6 +105,28 @@ class FunctionTest(XLATestCase):
       result = sess.run(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
+  def testCompileTimeConstantsInDefun(self):
+    """Tests that XLA handles compile-time constants in defuns."""
+    with self.test_session() as sess:
+
+      @function.Defun(dtypes.float32, dtypes.int32, dtypes.int32)
+      def Foo(a, c, d):
+        # c and d must be known at compile time
+        x = array_ops.slice(a, c, d)
+        return x
+
+      a = array_ops.placeholder(dtypes.float32)
+      c = array_ops.placeholder(dtypes.int32, shape=[4])
+      d = array_ops.placeholder(dtypes.int32, shape=[4])
+      with self.test_scope():
+        call_f = Foo(a, c, d)
+      result = sess.run(call_f, feed_dict={
+          a: np.ones([1, 4, 4, 1]),
+          c: [0, 0, 0, 0],
+          d: [1, 2, 2, 1]})
+
+    self.assertAllEqual(np.ones([1, 2, 2, 1]), result)
+
   # TODO(b/36139787): Re-enable this test when noinline works again.
   def DISABLED_testFunctionsNoInline(self):
 
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 3bc41b7cfd72bec7572097f8c53eef314a4369f6..12791ef8ac1da948608b1585f423ca217378f031 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -65,7 +65,8 @@ class RGBToHSVTest(XLATestCase):
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1)
       self.assertAllClose(batch2, join2)
-      self.assertAllCloseAccordingToType(batch2, inp, bfloat16_atol=0.03)
+      self.assertAllCloseAccordingToType(
+          batch2, inp, bfloat16_atol=0.03, half_rtol=0.02)
 
   def testRGBToHSVRoundTrip(self):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
diff --git a/tensorflow/compiler/tests/reduce_window_test.py b/tensorflow/compiler/tests/reduce_window_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e78a63465b80644d8810d9fa7433653bc4639fed
--- /dev/null
+++ b/tensorflow/compiler/tests/reduce_window_test.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for xla.reduce_window."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class ReduceWindowTest(XLATestCase):
+  """Test cases for xla.reduce_window."""
+
+  def _reduce_window(self, operand, init, reducer, **kwargs):
+    with self.test_session():
+      placeholder = array_ops.placeholder(operand.dtype)
+      with self.test_scope():
+        output = xla.reduce_window(placeholder, init, reducer, **kwargs)
+      return output.eval(feed_dict={placeholder: operand})
+
+  def testReduceWindow(self):
+
+    # TODO(b/77644762): float16 and float64 ReduceWindow are unimplemented.
+    for dtype in set(self.numeric_types).intersection(
+        set([dtypes.bfloat16.as_numpy_dtype, np.float32])):
+
+      @function.Defun(dtype, dtype)
+      def sum_reducer(x, y):
+        return x + y
+
+      @function.Defun(dtype, dtype)
+      def mul_reducer(x, y):
+        return x * y
+
+      self.assertAllClose(
+          np.array([3, 5, 7, 9, 11, 13], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2]))
+
+      self.assertAllClose(
+          np.array([3, 7, 11], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2],
+              window_strides=[2]))
+
+      self.assertAllClose(
+          np.array([1, 4, 7], dtype=dtype),
+          self._reduce_window(
+              np.array([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[1],
+              window_strides=[3]))
+
+      self.assertAllClose(
+          np.array([[24, 36, 24], [96, 0, 0]], dtype=dtype),
+          self._reduce_window(
+              np.array([[1, 2, 3, 4], [4, 3, 2, 1], [2, 4, 0, 1]], dtype=dtype),
+              1.0,
+              mul_reducer,
+              window_dimensions=[2, 2],
+              window_strides=[1, 1]))
+
+      self.assertAllClose(
+          np.array([[0, 0, 0], [5, 10, 5], [2, 4, 1], [0, 0, 0]], dtype=dtype),
+          self._reduce_window(
+              np.array([[1, 2, 3, 4], [4, 3, 2, 1], [2, 4, 0, 1]], dtype=dtype),
+              0.0,
+              sum_reducer,
+              window_dimensions=[2, 2],
+              window_strides=[2, 2],
+              padding=[[2, 3], [1, 2]]))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index 60839814931eaeb0b78a20fd1e4f387d241cd56f..f37c34156f96761632247be4bc1b62fca54f666e 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -163,14 +163,26 @@ class SpaceToBatchNDTest(XLATestCase):
         # error.
         if dtype == dtypes.bfloat16.as_numpy_dtype:
           continue
+        if dtype == np.float16:
+          actual_inputs = np.array(inputs).astype(dtype)
+          actual_paddings = np.array(paddings).astype(dtype)
+          expected_outputs = np.array(outputs).astype(dtype)
+        else:
+          actual_inputs = inputs
+          actual_paddings = paddings
+          expected_outputs = outputs
         placeholder = array_ops.placeholder(dtype)
         # outputs = space_to_batch(inputs)
-        x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings)
-        self.assertAllEqual(sess.run(x_tf, {placeholder: inputs}), outputs)
+        x_tf = array_ops.space_to_batch_nd(placeholder, block_shape,
+                                           actual_paddings)
+        self.assertAllEqual(
+            sess.run(x_tf, {placeholder: actual_inputs}), expected_outputs)
         # inputs = batch_to_space(outputs)
         placeholder = array_ops.placeholder(dtype)
-        x_tf = array_ops.batch_to_space_nd(placeholder, block_shape, paddings)
-        self.assertAllEqual(sess.run(x_tf, {placeholder: outputs}), inputs)
+        x_tf = array_ops.batch_to_space_nd(placeholder, block_shape,
+                                           actual_paddings)
+        self.assertAllEqual(
+            sess.run(x_tf, {placeholder: expected_outputs}), actual_inputs)
 
   def _testDirect(self, input_shape, block_shape, paddings):
     inputs = np.arange(np.prod(input_shape), dtype=np.float32)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 17149aa1c8edddadc504e916915a70f78abf8002..ba79f393a8f9b24ac506d2130957c38ecd442509 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -154,6 +154,9 @@ class UnaryOpsTest(XLATestCase):
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      # TODO(b/77694432): Half test failed on CPU, last ran on 04-06-2018.
+      if dtype == np.float16 and self.device == "XLA_CPU":
+        continue
       x = np.arange(-0.90, 0.90, 0.25)
       self._assertOpOutputMatchesExpected(
           math_ops.acos,
diff --git a/tensorflow/compiler/tests/while_test.py b/tensorflow/compiler/tests/while_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f79eb27435cc954cebde4357c1d946a320f4ed75
--- /dev/null
+++ b/tensorflow/compiler/tests/while_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for while loops in XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class WhileTest(XLATestCase):
+
+  def testSingletonLoopHandrolled(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32)
+    def loop_body(step):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      return step_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32)
+    def loop_cond(step):
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index], loop_cond, loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0})
+      self.assertAllClose(result, [10], rtol=1e-3)
+
+  def testCountingLoopHandrolled(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def loop_body(step, rsum):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      sum_out = rsum + constant_op.constant(1.5, dtype=dtypes.float32)
+      return step_out, sum_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def loop_cond(step, rsum):
+      del rsum
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      init_sum = array_ops.placeholder(dtypes.float32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, init_sum], loop_cond,
+                                      loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0, init_sum: 0.0})
+      self.assertAllClose(result, [10, 15.0], rtol=1e-3)
+      no_iters_result = sess.run(loop_outputs, {init_index: 10, init_sum: 0.0})
+      self.assertAllClose(no_iters_result, [10, 0.0], rtol=1e-3)
+
+  def testCountingLoopHandrolledC64(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.complex64)
+    def loop_body(step, rsum):
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      sum_out = rsum + constant_op.constant(1.5 + 2j, dtype=dtypes.complex64)
+      return step_out, sum_out
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.complex64)
+    def loop_cond(step, rsum):
+      del rsum
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      init_sum = array_ops.placeholder(dtypes.complex64, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, init_sum], loop_cond,
+                                      loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0, init_sum: 0.0})
+      self.assertAllClose(result[1], np.complex64(15 + 20j), rtol=1e-3)
+      no_iters_result = sess.run(loop_outputs, {init_index: 10, init_sum: 0.0})
+      self.assertAllClose(no_iters_result[1], np.complex64(0), rtol=1e-3)
+
+  def testLoopWithConstantOutput(self):
+    # Define a function for the loop body
+    @function.Defun(dtypes.int32, dtypes.int32)
+    def loop_body(step, x):
+      del x
+      step_out = step + constant_op.constant(1, dtype=dtypes.int32)
+      return (step_out, 7)
+
+    # Define a function for the loop condition
+    @function.Defun(dtypes.int32, dtypes.int32)
+    def loop_cond(step, x):
+      del x
+      return step < 10
+
+    with self.test_session() as sess:
+      init_index = array_ops.placeholder(dtypes.int32, [])
+      with self.test_scope():
+        loop_outputs = xla.while_loop([init_index, 42], loop_cond, loop_body)
+
+      result = sess.run(loop_outputs, {init_index: 0})
+      self.assertAllClose(result, [10, 7], rtol=1e-3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index e7daf4e01c45c3705216fce7dd3db5baa0c261fc..ba5c3a14849cefcb680b03425232724ff32375a8 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -415,7 +415,7 @@ cc_library(
         "//tensorflow/compiler/jit:graph_to_functiondef",
         "//tensorflow/compiler/jit:union_find",
         "//tensorflow/compiler/tf2xla:dump_graph",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:core_cpu",
@@ -437,7 +437,7 @@ tf_cc_test(
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:resource_variable_ops",
-        "//tensorflow/compiler/tf2xla/cc:functional_ops",
+        "//tensorflow/compiler/tf2xla/cc:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
index c30bb9cacd48fb93ac359a6a25699ba6a74183c5..4f8bb8ad743afe69a6544c2ae0dc7309891b2df3 100644
--- a/tensorflow/compiler/tf2xla/cc/BUILD
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -7,44 +7,20 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc")
 
 tf_gen_op_wrapper_cc(
-    name = "functional_ops_gen",
-    include_internal_ops = 1,
-    out_ops_file = "ops/functional_ops",
-    deps = ["//tensorflow/compiler/tf2xla/ops:functional_ops"],
+    name = "xla_ops_gen",
+    out_ops_file = "ops/xla_ops",
+    deps = ["//tensorflow/compiler/tf2xla/ops:xla_ops"],
 )
 
 cc_library(
-    name = "functional_ops",
-    srcs = ["ops/functional_ops.cc"],
-    hdrs = ["ops/functional_ops.h"],
+    name = "xla_ops",
+    srcs = ["ops/xla_ops.cc"],
+    hdrs = ["ops/xla_ops.h"],
     deps = [
         "//tensorflow/cc:const_op",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_gen_op_wrapper_cc(
-    name = "sendrecv_ops_gen",
-    include_internal_ops = 1,
-    out_ops_file = "ops/sendrecv_ops",
-    deps = ["//tensorflow/compiler/tf2xla/ops:sendrecv_ops"],
-)
-
-cc_library(
-    name = "sendrecv_ops",
-    srcs = ["ops/sendrecv_ops.cc"],
-    hdrs = ["ops/sendrecv_ops.h"],
-    deps = [
-        "//tensorflow/cc:const_op",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 8b7beef83ec2ed0df780d6a9cb2a4bcf737d008b..16b9142cbf7d2afe99c22acbc32fb17c09b00081 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -901,6 +901,14 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
       int src_depth = switch_depth[src_id];
       if (!e->IsControlEdge() || new_switch_depth == src_depth) {
         if (src_depth != new_switch_depth) {
+          // TODO(b/77601805) remove this when outside_compilation supports
+          // control flow.
+          if (str_util::StrContains(src->name(), "outside_compilation") ||
+              str_util::StrContains(n->name(), "outside_compilation")) {
+            return errors::InvalidArgument(
+                "outside_compilation is not yet supported within TensorFlow "
+                "control flow constructs b/77601805");
+          }
           return errors::InvalidArgument(
               "Unable to functionalize control flow in graph: Operand ('",
               src->name(), "') and operator ('", n->name(),
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index bc7276c3afd5060d6faeceb4d479416299ecc5da..e494f42e8ed254ac0c7c7a23a13728d3f015e9d3 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/compiler/tf2xla/cc/ops/functional_ops.h"
+#include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
 #include "tensorflow/compiler/tf2xla/test_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index f1bc7d6af49a09f84ef251eaa1c3d684792d0c1e..579b66969990017688477443115cc4f61c18fe4a 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -29,6 +29,7 @@ tf_kernel_library(
         "cwise_ops.h",
         "depthtospace_op.cc",
         "diag_op.cc",
+        "dynamic_slice_ops.cc",
         "dynamic_stitch_op.cc",
         "elu_op.cc",
         "extract_image_patches_op.cc",
@@ -56,6 +57,7 @@ tf_kernel_library(
         "pooling_ops.cc",
         "quantize_and_dequantize_op.cc",
         "random_ops.cc",
+        "reduce_window_op.cc",
         "reduction_ops.cc",
         "reduction_ops.h",
         "reduction_ops_common.cc",
@@ -103,7 +105,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla/lib:triangular_solve",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/lib:while_loop",
-        "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -146,7 +148,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/core:framework",
@@ -162,7 +164,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/core:framework",
@@ -171,6 +173,23 @@ tf_kernel_library(
     ],
 )
 
+# Kernels that have a dummy (no-op) implementation.
+tf_kernel_library(
+    name = "xla_dummy_ops",
+    srcs = [
+        "assert_op.cc",
+        "check_numerics_op.cc",
+    ],
+    deps = [
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:logging_ops_op_lib",
+    ],
+    alwayslink = 1,
+)
+
 # Kernels that only work on CPU, because they use XLA custom calls.
 # Only link this when using the CPU backend for XLA.
 tf_kernel_library(
diff --git a/tensorflow/compiler/tf2xla/kernels/assert_op.cc b/tensorflow/compiler/tf2xla/kernels/assert_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af4ab5e8ef6e268226edc90515706405ac36858c
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/assert_op.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace {
+
+// This TensorFlow op supports the Assert primitve.
+class AssertOp : public XlaOpKernel {
+ public:
+  explicit AssertOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  ~AssertOp() override {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    static mutex mu(tensorflow::LINKER_INITIALIZED);
+    static int log_counter = 0;
+
+    mutex_lock l(mu);
+    if (log_counter < 20) {
+      ++log_counter;
+      LOG(WARNING) << "Ignoring Assert operator " << name();
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(AssertOp);
+};
+
+REGISTER_XLA_OP(Name("Assert"), AssertOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc b/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6061e822d8d9c6c807a63aad4e9e9526a49e456c
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/check_numerics_op.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace {
+
+class CheckNumericsOp : public XlaOpKernel {
+ public:
+  explicit CheckNumericsOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // TODO(b/32223192): add a real implementation of CheckNumerics
+    {
+      static mutex mu(tensorflow::LINKER_INITIALIZED);
+      static int log_counter = 0;
+      mutex_lock l(mu);
+      if (log_counter < 20) {
+        ++log_counter;
+        LOG(WARNING) << "Ignoring CheckNumerics operator " << name();
+      }
+    }
+    ctx->SetOutput(0, ctx->Input(0));
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CheckNumericsOp);
+};
+
+REGISTER_XLA_OP(Name("CheckNumerics"), CheckNumericsOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..800ef5ab98d70ad822c6efffb33db28b46ae50fe
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+
+namespace tensorflow {
+namespace {
+
+class DynamicUpdateSliceOp : public XlaOpKernel {
+ public:
+  explicit DynamicUpdateSliceOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    VLOG(3) << "DynamicUpdateSliceOp::Compile";
+
+    DataType index_type = input_type(2);
+    OP_REQUIRES(ctx, index_type == DT_INT32 || index_type == DT_INT64,
+                errors::InvalidArgument("index must be int32 or int64"));
+
+    const TensorShape input_shape = ctx->InputShape(0);
+    const TensorShape update_shape = ctx->InputShape(1);
+    const TensorShape index_shape = ctx->InputShape(2);
+
+    OP_REQUIRES(
+        ctx,
+        TensorShapeUtils::IsVector(index_shape) &&
+            index_shape.num_elements() == input_shape.dims(),
+        errors::InvalidArgument("index must be a vector with length equal to "
+                                "the number of input dimensions"));
+    OP_REQUIRES(
+        ctx, input_shape.dims() == update_shape.dims(),
+        errors::InvalidArgument("input and update must have the same rank,"
+                                " input shape is ",
+                                input_shape.DebugString(), "; update shape is ",
+                                update_shape.DebugString()));
+
+    xla::ComputationDataHandle result = ctx->builder()->DynamicUpdateSlice(
+        ctx->Input(0), ctx->Input(1), ctx->Input(2));
+    ctx->SetOutput(0, result);
+  }
+};
+
+REGISTER_XLA_OP(Name("XlaDynamicUpdateSlice"), DynamicUpdateSliceOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb144bea9e429b7c8bcc3d07f688ed6a254c3be0
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/while_op.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class ReduceWindowOp : public XlaOpKernel {
+ public:
+  explicit ReduceWindowOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("computation", &computation_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("window_dimensions", &window_dimensions_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("window_strides", &window_strides_));
+    OP_REQUIRES_OK(context, context->GetAttr("padding_low", &padding_low_));
+    OP_REQUIRES_OK(context, context->GetAttr("padding_high", &padding_high_));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const DataType dtype = context->input_type(0);
+
+    const int rank = input_shape.dims();
+    OP_REQUIRES(context, rank == window_dimensions_.size(),
+                errors::InvalidArgument(
+                    "The size of window_dimensions must be equal to the input "
+                    "rank (",
+                    window_dimensions_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == window_strides_.size(),
+                errors::InvalidArgument(
+                    "The size of window_strides must be equal to the input "
+                    "rank (",
+                    window_strides_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == padding_low_.size(),
+                errors::InvalidArgument(
+                    "The size of padding_low must be equal to the input "
+                    "rank (",
+                    padding_low_.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == padding_high_.size(),
+                errors::InvalidArgument(
+                    "The size of padding_high must be equal to the input "
+                    "rank (",
+                    padding_high_.size(), " vs. ", rank, ")"));
+
+    xla::ComputationBuilder* builder = context->builder();
+
+    // Build the reducer function.
+    XlaCompiler::Argument reducer_arg;
+    reducer_arg.kind = XlaCompiler::Argument::kParameter;
+    reducer_arg.type = dtype;
+    reducer_arg.shape = TensorShape();
+
+    XlaCompiler::CompileOptions compile_options;
+    compile_options.use_tuple_arg = false;
+    compile_options.resolve_compile_time_constants = false;
+    compile_options.is_entry_computation = false;
+    XlaCompiler::CompilationResult reducer;
+    OP_REQUIRES_OK(context, context->compiler()->CompileFunction(
+                                compile_options, *computation_,
+                                {reducer_arg, reducer_arg}, &reducer));
+
+    xla::Shape scalar_shape;
+    OP_REQUIRES_OK(context,
+                   TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape));
+    OP_REQUIRES(context,
+                xla::ShapeUtil::Compatible(
+                    reducer.xla_output_shape,
+                    xla::ShapeUtil::MakeTupleShape({scalar_shape})),
+                errors::InvalidArgument(
+                    "Invalid output shape of ReduceWindow reducer. Expected ",
+                    xla::ShapeUtil::HumanString(scalar_shape), " got ",
+                    xla::ShapeUtil::HumanString(reducer.xla_output_shape)));
+
+    // Wraps the reducer in a computation that unpacks the output tuple.
+    xla::Computation wrapper;
+    {
+      std::unique_ptr<xla::ComputationBuilder> cb =
+          builder->CreateSubBuilder("wrapper");
+      auto x = cb->Parameter(0, scalar_shape, "x");
+      auto y = cb->Parameter(1, scalar_shape, "y");
+      auto outputs = cb->Call(*reducer.computation, {x, y});
+      cb->GetTupleElement(outputs, 0);
+      xla::StatusOr<xla::Computation> result = cb->Build();
+      OP_REQUIRES_OK(context, result.status());
+      wrapper = std::move(result.ValueOrDie());
+    }
+
+    std::vector<std::pair<int64, int64>> padding(rank);
+    for (int i = 0; i < rank; ++i) {
+      padding[i] = {padding_low_[i], padding_high_[i]};
+    }
+
+    xla::ComputationDataHandle output = builder->ReduceWindowWithGeneralPadding(
+        context->Input(0), context->Input(1), wrapper, window_dimensions_,
+        window_strides_, padding);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  const NameAttrList* computation_;
+  std::vector<int64> window_dimensions_;
+  std::vector<int64> window_strides_;
+  std::vector<int64> padding_low_;
+  std::vector<int64> padding_high_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ReduceWindowOp);
+};
+
+REGISTER_XLA_OP(Name("XlaReduceWindow"), ReduceWindowOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
index 5172781c0d05b6682fe92086654e3b86961949ee..d079b89861817a5639ac72b5ee49d76cb4506ae8 100644
--- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
@@ -48,7 +48,7 @@ void SendOp::Compile(XlaOpKernelContext* ctx) {
   ctx->builder()->Send(ctx->Input(0), channel);
 }
 
-REGISTER_XLA_OP(Name("_XLASend"), SendOp);
+REGISTER_XLA_OP(Name("XlaSend"), SendOp);
 
 class RecvOp : public XlaOpKernel {
  public:
@@ -68,7 +68,7 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
   TensorShape tensor_shape;
   DataType dtype;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &tensor_shape));
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype));
   OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, tensor_shape, &shape_));
 }
 
@@ -79,7 +79,7 @@ void RecvOp::Compile(XlaOpKernelContext* ctx) {
   ctx->SetOutput(0, ctx->builder()->Recv(shape_, channel));
 }
 
-REGISTER_XLA_OP(Name("_XLARecv"), RecvOp);
+REGISTER_XLA_OP(Name("XlaRecv"), RecvOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index aeb743a6634673f2e8c4dee9ae1e5017944aae2c..bb9168fa358154f3db9dab87bacc9bf28dd16406 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -7,17 +7,13 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
 cc_library(
-    name = "functional_ops",
-    srcs = ["functional_ops.cc"],
-    deps = [
-        "//tensorflow/core:framework",
+    name = "xla_ops",
+    srcs = [
+        "dynamic_slice_ops.cc",
+        "functional_ops.cc",
+        "reduce_window_op.cc",
+        "sendrecv_ops.cc",
     ],
-    alwayslink = 1,
-)
-
-cc_library(
-    name = "sendrecv_ops",
-    srcs = ["sendrecv_ops.cc"],
     deps = [
         "//tensorflow/core:framework",
     ],
@@ -25,17 +21,9 @@ cc_library(
 )
 
 tf_gen_op_wrapper_py(
-    name = "gen_functional_ops",
-    out = "gen_functional_ops.py",
-    deps = [
-        ":functional_ops",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "gen_sendrecv_ops",
-    out = "gen_sendrecv_ops.py",
+    name = "gen_xla_ops",
+    out = "gen_xla_ops.py",
     deps = [
-        ":sendrecv_ops",
+        ":xla_ops",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6c0edbb889b1751ac9d9d47d0c9534b543196ff
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("XlaDynamicUpdateSlice")
+    .Input("input: T")
+    .Input("update: T")
+    .Input("indices: Tindices")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Wraps the XLA DynamicUpdateSlice operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#dynamicupdateslice
+.
+
+XlaDynamicUpdateSlice generates a result which is the value of the `input`
+operand, with a slice update overwritten at `indices`. The shape of `update`
+determines the shape of the sub-array of the result which is updated. The shape
+of indices must be rank == 1, with dimension size equal to the rank of `input`.
+
+Handling of out-of-bounds slice indices is implementation-defined.
+
+input: A `Tensor` of type T.
+indices: A vector of indices into `input`. Must have length equal to the rank of
+  `input`.
+update: A `Tensor` of type T. Same rank as `input`.
+output: A `Tensor` of type T.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc b/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9af982adc090ea78c711fd4656ba429c53b18c9
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("XlaReduceWindow")
+    .Input("input: T")
+    .Input("init_value: T")
+    .Attr("T: numbertype")
+    .Attr("computation: func")
+    .Attr("window_dimensions: list(int)")
+    .Attr("window_strides: list(int)")
+    .Attr("padding_low: list(int)")
+    .Attr("padding_high: list(int)")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Wraps the XLA ReduceWindow operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow .
+
+input: the input tensor
+init_value: a scalar representing the initial value for the reduction
+computation: a reducer function to apply
+window_dimensions: the shape of the window
+window_strides: the inter-window strides
+padding_low: the padding to apply at the start of each input dimensions
+padding_high: the padding to apply at the end of each input dimension.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
index 4b41c16a8b3fdc0c3412c76d29d3ec2b7bdfd0aa..7ec7b50e905a6cbdecea4543dcb87322b5a7e844 100644
--- a/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc
@@ -18,22 +18,24 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("_XLASend")
+REGISTER_OP("XlaSend")
     .Input("tensor: T")
     .Attr("T: type")
     .Attr("tensor_name: string")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Sends the named tensor to another XLA computation.
+Sends the named tensor to another XLA computation. Wraps the XLA Send operator
+documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#send .
 
 tensor: The tensor to send.
-tensor_name: The name of the tensor to send.
+tensor_name: A string key that identifies the channel.
 )doc");
 
-REGISTER_OP("_XLARecv")
-    .Output("tensor: T")
-    .Attr("T: type")
+REGISTER_OP("XlaRecv")
+    .Output("tensor: dtype")
+    .Attr("dtype: type")
     .Attr("tensor_name: string")
     .Attr("shape: shape")
     .SetIsStateful()
@@ -46,11 +48,14 @@ REGISTER_OP("_XLARecv")
       return Status::OK();
     })
     .Doc(R"doc(
-Receives the named tensor from another XLA computation.
+Receives the named tensor from another XLA computation. Wraps the XLA Recv
+operator documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#recv .
 
 tensor: The tensor to receive.
-tensor_name: The name of the tensor to receive.
-shape: The shape of the input tensor.
+dtype: The type of the tensor.
+tensor_name: A string key that identifies the channel.
+shape: The shape of the tensor.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index f0a2ef0651ff6115bd201a3b1c34b3c061a22a3d..42b6292f79ffddd155c05758a1420a2a583eb0c6 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -22,3 +22,11 @@ tf_py_clif_cc(
         "//tensorflow/compiler/tf2xla:xla_compiler",
     ],
 )
+
+py_library(
+    name = "xla",
+    srcs = ["xla.py"],
+    deps = [
+        "//tensorflow/compiler/tf2xla/ops:gen_xla_ops",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ce65bec950fdfd38c3ca5bc62ac745ef8ca4a7
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental library that exposes XLA operations directly in TensorFlow.
+
+It is sometimes useful to be able to build HLO programs directly from
+TensorFlow. This file provides Tensorflow operators that map as closely as
+possible to HLO operators.
+
+There is no promise of backward or forward compatibility for operators defined
+in this module.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tf2xla.ops import gen_xla_ops
+
+# TODO(phawkins): provide wrappers for all XLA operators.
+
+dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
+
+
+def reduce_window(operand,
+                  init,
+                  reducer,
+                  window_dimensions,
+                  window_strides=None,
+                  padding=None,
+                  name=None):
+  """Wraps the XLA ReduceWindow operator.
+
+  ReduceWindow is documented at
+  https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow .
+
+  Args:
+    operand: the input tensor
+    init: a scalar tensor representing the initial value for the reduction
+    reducer: a reduction function that combines a pair of scalars.
+    window_dimensions: shape of the window, as a list of integers
+    window_strides: inter-window strides, as a list of integers. Optional;
+      if omitted, defaults to strides of 1.
+    padding: padding to apply to 'operand'. List of (low, high) pairs of
+      integers that specify the padding to apply before and after each
+      dimension. Optional; if omitted, defaults to no padding.
+    name: the operator name, or None.
+  Returns:
+    A tensor that represents the output of the reduce_window operator.
+  """
+  window_strides = window_strides or [1] * len(window_dimensions)
+  padding = padding or [(0, 0)] * len(window_dimensions)
+  padding_low = [x for (x, _) in padding]
+  padding_high = [y for (_, y) in padding]
+  return gen_xla_ops.xla_reduce_window(
+      operand,
+      init,
+      reducer,
+      window_dimensions,
+      window_strides,
+      padding_low,
+      padding_high,
+      name=name)
+
+
+recv = gen_xla_ops.xla_recv
+send = gen_xla_ops.xla_send
+
+while_loop = gen_xla_ops.xla_while
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index a9978e697b091715ce120f0d18fdddd259e08b32..b813668a9edd3a704a9dca1eaa588c1eced6ac31 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -90,6 +90,11 @@ TEST(ConvertGraphDefToXla, Sum) {
   TF_EXPECT_OK(result_or.status());
   std::unique_ptr<xla::Literal> result = std::move(result_or.ValueOrDie());
   EXPECT_EQ("(s32[]) (\n42\n)", result->ToString());
+
+  config.mutable_feed(0)->mutable_id()->set_output_index(
+      123); /* invalid output_index */
+  EXPECT_TRUE(errors::IsInvalidArgument(
+      ConvertGraphDefToXla(graph_def, config, client, &computation)));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index f428a194328935fec1210ea96245344de859e611..7ec85aa3cdec622cae509f45c5ba7740222025f9 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -151,8 +151,15 @@ Status AddPlaceholdersForFeeds(
       Status status;
       Node* feed_node = g.AddNode(gd.node(0), &status);
       TF_RETURN_IF_ERROR(status);
-      info.data_type =
-          BaseType(feed_node->output_type(info.feed->id().output_index()));
+
+      if (info.feed->id().output_index() < feed_node->num_outputs()) {
+        info.data_type =
+            BaseType(feed_node->output_type(info.feed->id().output_index()));
+      } else {
+        return errors::InvalidArgument(
+            "Invalid output_index ", info.feed->id().output_index(),
+            " for feed node ", info.feed->id().node_name());
+      }
     }
   }
 
@@ -281,4 +288,13 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) {
   return Status::OK();
 }
 
+void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype,
+                                   KernelDef* kdef) {
+  for (KernelDef::AttrConstraint& constraint : *kdef->mutable_constraint()) {
+    if (constraint.name() == name) {
+      constraint.mutable_allowed_values()->mutable_list()->add_type(dtype);
+    }
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index e5fba8ede7745febbb42c572a7b52247213afc95..745beb39c1d917cd0d1cd219536ee26a96253ec9 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -51,6 +52,10 @@ string TensorIdToString(const tf2xla::TensorId& id);
 // edges are considered.
 Status SetNodeShardingFromNeighbors(Node* n, bool out_edges);
 
+// Add an allowed data type to the AttrConstraint with the given name.
+void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype,
+                                   KernelDef* kdef);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
index 8286480e0ea07429adbe31ec4f16d043e321df0a..ead229aaccc292d4944db0c1eaf98c82583533cd 100644
--- a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 
@@ -30,6 +31,12 @@ bool CpuOpFilter(KernelDef* kdef) {
         DT_FLOAT);
     return true;
   }
+  if (kdef->op() == "Const") {
+    AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
+  }
+  if (kdef->op() == "Assert") {
+    AddDtypeToKernalDefConstraint("T", DT_STRING, kdef);
+  }
   return true;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index 8ca757e72355d890c13b8b448d35c327d3986696..62168b648331844bfe2db1a4d5dcad895c8726f3 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 
@@ -25,6 +26,12 @@ bool GpuOpFilter(KernelDef* kdef) {
       kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal") {
     return false;
   }
+  if (kdef->op() == "Const") {
+    AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
+  }
+  if (kdef->op() == "Assert") {
+    AddDtypeToKernalDefConstraint("T", DT_STRING, kdef);
+  }
   return true;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 3b0b2f06ebae4af918cbe6fb8a384004c1858998..62a5114837e07f35134ad99e28880d6a9233a213 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -122,6 +122,9 @@ xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b,
 xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b,
                                                DataType data_type) {
   switch (data_type) {
+    case DT_HALF:
+      return b->ConstantR0<Eigen::half>(
+          static_cast<Eigen::half>(Eigen::NumTraits<Eigen::half>::epsilon()));
     case DT_BFLOAT16:
       return b->ConstantR0<bfloat16>(bfloat16::epsilon());
     case DT_FLOAT:
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 3f45167fcb77cd3085c9645fba0b2901329c4bb2..f0f94298a05f7c4bdc41cbfb8572454fbedd371d 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -193,6 +193,34 @@ StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
   return Transfer(*data, shape_with_output_layout);
 }
 
+StatusOr<std::unique_ptr<Literal>> Client::ComputeConstant(
+    const XlaComputation& computation, const Layout* output_layout) const {
+  ComputeConstantGraphRequest request;
+  *request.mutable_computation() = computation.proto();
+  if (output_layout != nullptr) {
+    *request.mutable_output_layout() = *output_layout;
+  }
+
+  ComputeConstantResponse response;
+
+  VLOG(2) << "making compute-constant-graph request";
+  Status s = stub_->ComputeConstantGraph(&request, &response);
+  VLOG(2) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  VLOG(3) << "ComputeConstant: {" << response.DebugString() << "}";
+
+  if (!response.has_literal()) {
+    return InternalError(
+        "no computed literal in the provided response in ComputeConstantGraph "
+        "request");
+  }
+  return Literal::CreateFromProto(response.literal());
+}
+
 StatusOr<Computation> Client::LoadSnapshot(const SessionModule& module) {
   LoadComputationSnapshotRequest request;
   *request.mutable_module() = module;
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index 05d707dab1533f44ce827157e888720e218d4c9c..14c685d94ea31c382d84223ca4e2eba544420d78 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -194,6 +194,27 @@ class Client {
       const ExecutionOptions* execution_options = nullptr,
       ExecutionProfile* execution_profile = nullptr);
 
+  // Computes the value of the given computation using a non-optimized
+  // interpreter on the host.
+  //
+  // The computation must not depend on any parameters, or on stateful operators
+  // such as `RngNormal` or `Infeed`.
+  //
+  // This functionality can be useful when translating a computation into XLA
+  // where something that looked dynamic is required by XLA to be specified as a
+  // constant. E.g. the source computation (outside of XLA) may include a
+  // dynamic computation of the shape of something and ComputeConstant lets you
+  // determine what the value of that computation is in the case where the value
+  // can be determined at compile time.
+  //
+  // If output_layout is non-null, then the output of the computation will be
+  // stored using that layout.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
+      const XlaComputation& computation,
+      const Layout* output_layout = nullptr) const;
+
   // Unregister the memory for the given GlobalData on the device.
   Status Unregister(const GlobalData& data);
 
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index f4673a8204f27e93441c73f6dcc9130d96cfcebc..59c4a53c05a45490a7c8e732840a4e70767c46c2 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -46,6 +46,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index b63a1465ea755b906853860d47768ecbeaa0dcdd..311dc4bdd72cfd7999e83a26e11614d6ca005bce 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -111,4 +111,20 @@ std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
   return fake_arguments;
 }
 
+std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
+    const XlaComputation& computation, Client* client) {
+  CHECK(computation.proto().has_program_shape())
+      << "Computation should have progran shape.";
+  auto program_shape = computation.proto().program_shape();
+
+  // For every (unbound) parameter that the computation wants, we manufacture
+  // some arbitrary data so that we can invoke the computation.
+  std::vector<std::unique_ptr<GlobalData>> fake_arguments;
+  for (const Shape& parameter : program_shape.parameters()) {
+    fake_arguments.push_back(MakeFakeDataOrDie(parameter, client));
+  }
+
+  return fake_arguments;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index 7e640d1307edcc3e2c021f4391c456f578a015ee..1dc2622972d5fd3da6991d70b800cc3fd5a638f4 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -38,6 +39,12 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
     const Computation& computation, Client* client);
 
+// Returns vector of GlobalData handles of fake data (created using
+// MakeFakeDataOrDie) that are correctly shaped arguments for the given
+// xla computation.
+std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
+    const XlaComputation& computation, Client* client);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TESTING_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
index b1dba168565cca86cba0403604736fecd00d6f29..31fa1241ee474a31575c45cf7652063dfc818fac 100644
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -44,6 +44,7 @@ cc_library(
     hdrs = ["xla_builder.h"],
     deps = [
         ":xla_computation",
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index 2d587cc3b9c51d5bd81652d17b23d4ad05c84dd3..7ccdc2ded2c099690bc9187936db6491ef4142dd 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -17,12 +17,15 @@ limitations under the License.
 
 #include <functional>
 #include <numeric>
+#include <queue>
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 
@@ -82,7 +85,7 @@ StatusOr<Shape> XlaOp::GetShape() const {
 }
 
 XlaBuilder::XlaBuilder(const string& computation_name)
-    : name_(computation_name), unique_id_(GetUniqueId()) {}
+    : name_(computation_name) {}
 
 XlaBuilder::~XlaBuilder() {}
 
@@ -111,10 +114,11 @@ XlaOp XlaBuilder::NoteErrorOrReturn(
   return op.ConsumeValueOrDie();
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) {
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) const {
   TF_RETURN_IF_ERROR(first_error_);
 
   TF_RET_CHECK(root_id != nullptr);
+
   ProgramShape program_shape;
 
   // Not all instructions can be roots. Walk backwards from the last added
@@ -155,9 +159,56 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) {
   return program_shape;
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape() {
-  int64 root_id;
-  return GetProgramShape(&root_id);
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape() const {
+  int64 root;
+  return GetProgramShape(&root);
+}
+
+void XlaBuilder::IsConstantVisitor(const int64 op_handle,
+                                   std::set<int64>* visited,
+                                   bool* is_constant) const {
+  if (visited->count(op_handle) != 0 || !*is_constant) {
+    return;
+  }
+
+  CHECK(op_handle < instructions_.size() && op_handle >= 0);
+
+  const HloInstructionProto& instr = instructions_[op_handle];
+  const HloOpcode opcode = StringToHloOpcode(instr.opcode()).ValueOrDie();
+  switch (opcode) {
+    default:
+      for (const int64 operand_id : instr.operand_ids()) {
+        IsConstantVisitor(operand_id, visited, is_constant);
+      }
+      // TODO(b/32495713): We aren't checking the called computations.
+      break;
+
+    // Non functional ops.
+    case HloOpcode::kRng:
+    case HloOpcode::kCrossReplicaSum:
+      // TODO(b/33009255): Implmement constant folding for cross replica sum.
+    case HloOpcode::kInfeed:
+    case HloOpcode::kOutfeed:
+    case HloOpcode::kHostCompute:
+    case HloOpcode::kCall:
+      // TODO(b/32495713): We aren't checking the to_apply computation itself,
+      // so we conservatively say that computations containing the Call op
+      // cannot be constant.  We cannot set is_functional=false in other similar
+      // cases since we're already relying on IsConstant to return true.
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kWhile:
+      // TODO(b/32495713): We aren't checking the condition and body
+      // computations themselves.
+    case HloOpcode::kSend:
+    case HloOpcode::kRecv:
+    case HloOpcode::kParameter:
+      *is_constant = false;
+      break;
+  }
+  if (!*is_constant) {
+    VLOG(1) << "Non-constant: " << instr.name();
+  }
+  visited->insert(op_handle);
 }
 
 XlaComputation XlaBuilder::BuildAndNoteError() {
@@ -180,21 +231,24 @@ StatusOr<XlaComputation> XlaBuilder::Build() {
   }
 
   HloComputationProto entry;
+  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
+  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
 
   {
     int64 root_id;
-    ProgramShape program_shape;
-    TF_ASSIGN_OR_RETURN(program_shape, GetProgramShape(&root_id));
-    entry.mutable_program_shape()->Swap(&program_shape);
+    TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(),
+                        GetProgramShape(&root_id));
     entry.set_root_id(root_id);
   }
 
   for (auto& instruction : instructions_) {
+    // Ensures that the instruction names are unique among the whole graph.
+    const string& new_name =
+        StrCat(instruction.name(), ".", entry.id(), ".", instruction.id());
+    instruction.set_name(new_name);
     entry.add_instructions()->Swap(&instruction);
   }
 
-  entry.set_id(unique_id_);
-  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
   XlaComputation computation(entry.id());
   HloModuleProto* module = computation.mutable_proto();
   module->set_name(entry.name());
@@ -417,11 +471,10 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
                             const string& name) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    if (parameter_numbers_.find(parameter_number) != parameter_numbers_.end()) {
+    if (!parameter_numbers_.insert(parameter_number).second) {
       return InvalidArgument("parameter %lld already registered",
                              parameter_number);
     }
-    parameter_numbers_.insert(parameter_number);
     instr.set_parameter_number(parameter_number);
     instr.set_name(name);
     *instr.mutable_shape() = shape;
@@ -485,7 +538,17 @@ XlaOp XlaBuilder::Slice(const XlaOp& operand,
 
 XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
                              int64 limit_index, int64 stride, int64 dimno) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    std::vector<int64> starts(ShapeUtil::Rank(shape), 0);
+    std::vector<int64> limits(shape.dimensions().begin(),
+                              shape.dimensions().end());
+    std::vector<int64> strides(ShapeUtil::Rank(shape), 1);
+    starts[dimno] = start_index;
+    limits[dimno] = limit_index;
+    strides[dimno] = stride;
+    return Slice(operand, starts, limits, strides);
+  });
 }
 
 XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
@@ -548,7 +611,22 @@ XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
 
 XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
                       const PaddingConfig& padding_config) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape,
+                        GetShape(padding_value));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferPadShape(operand_shape, padding_value_shape,
+                                      padding_config));
+
+    *instr.mutable_padding_config() = padding_config;
+
+    return AddInstruction(std::move(instr), HloOpcode::kPad,
+                          {operand, padding_value});
+  });
 }
 
 XlaOp XlaBuilder::Reshape(const XlaOp& operand,
@@ -578,7 +656,45 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
 
 XlaOp XlaBuilder::Collapse(const XlaOp& operand,
                            tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (dimensions.size() <= 1) {
+      // Not collapsing anything, trivially we can return the operand versus
+      // enqueueing a trivial reshape.
+      return operand;
+    }
+
+    // Out-of-order collapse is not supported.
+    // Checks that the collapsed dimensions are in order and consecutive.
+    for (tensorflow::gtl::ArraySlice<int64>::size_type i = 1;
+         i < dimensions.size(); ++i) {
+      if (dimensions[i] - 1 != dimensions[i - 1]) {
+        return InvalidArgument(
+            "Collapsed dimensions are not in consecutive order.");
+      }
+    }
+
+    // Create a new sizes vector from the old shape, replacing the collapsed
+    // dimensions by the product of their sizes.
+    TF_ASSIGN_OR_RETURN(const Shape& original_shape, GetShape(operand));
+
+    VLOG(3) << "original shape: " << ShapeUtil::HumanString(original_shape);
+    VLOG(3) << "dims to collapse: "
+            << tensorflow::str_util::Join(dimensions, ",");
+
+    std::vector<int64> new_sizes;
+    for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) {
+      if (i <= dimensions.front() || i > dimensions.back()) {
+        new_sizes.push_back(original_shape.dimensions(i));
+      } else {
+        new_sizes.back() *= original_shape.dimensions(i);
+      }
+    }
+
+    VLOG(3) << "new sizes: [" << tensorflow::str_util::Join(new_sizes, ",")
+            << "]";
+
+    return Reshape(operand, new_sizes);
+  });
 }
 
 void XlaBuilder::Trace(const string& tag, const XlaOp& operand) {
@@ -684,24 +800,101 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
   });
 }
 
+Status XlaBuilder::VerifyConvolution(
+    const Shape& lhs_shape, const Shape& rhs_shape,
+    const ConvolutionDimensionNumbers& dimension_numbers) const {
+  if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) {
+    return InvalidArgument(
+        "Convolution arguments must have same number of "
+        "dimensions. Got: %s and %s",
+        ShapeUtil::HumanString(lhs_shape).c_str(),
+        ShapeUtil::HumanString(rhs_shape).c_str());
+  }
+  int num_dims = ShapeUtil::Rank(lhs_shape);
+  if (num_dims < 2) {
+    return InvalidArgument(
+        "Convolution expects argument arrays with >= 3 dimensions. "
+        "Got: %s and %s",
+        ShapeUtil::HumanString(lhs_shape).c_str(),
+        ShapeUtil::HumanString(rhs_shape).c_str());
+  }
+  int num_spatial_dims = num_dims - 2;
+
+  const auto check_spatial_dimensions =
+      [&](const char* const field_name,
+          const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>&
+              numbers) {
+        if (numbers.size() != num_spatial_dims) {
+          return InvalidArgument("Expected %d elements for %s, but got %d.",
+                                 num_spatial_dims, field_name, numbers.size());
+        }
+        for (int i = 0; i < numbers.size(); ++i) {
+          if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) {
+            return InvalidArgument("Convolution %s[%d] is out of bounds: %lld",
+                                   field_name, i, numbers.Get(i));
+          }
+        }
+        return Status::OK();
+      };
+  TF_RETURN_IF_ERROR(
+      check_spatial_dimensions("input_spatial_dimensions",
+                               dimension_numbers.input_spatial_dimensions()));
+  TF_RETURN_IF_ERROR(
+      check_spatial_dimensions("kernel_spatial_dimensions",
+                               dimension_numbers.kernel_spatial_dimensions()));
+  return check_spatial_dimensions(
+      "output_spatial_dimensions",
+      dimension_numbers.output_spatial_dimensions());
+}
+
 XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-  return UnimplementedOp();
+  return ConvWithGeneralDimensions(
+      lhs, rhs, window_strides, padding,
+      CreateDefaultConvDimensionNumbers(window_strides.size()));
 }
 
 XlaOp XlaBuilder::ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  return UnimplementedOp();
+  return ConvGeneral(lhs, rhs, window_strides, padding,
+                     CreateDefaultConvDimensionNumbers(window_strides.size()));
 }
 
 XlaOp XlaBuilder::ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+
+    TF_RETURN_IF_ERROR(
+        VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers));
+
+    std::vector<int64> base_area_dimensions(
+        dimension_numbers.input_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < base_area_dimensions.size();
+         ++i) {
+      base_area_dimensions[i] =
+          lhs_shape.dimensions(dimension_numbers.input_spatial_dimensions(i));
+    }
+
+    std::vector<int64> window_dimensions(
+        dimension_numbers.kernel_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < window_dimensions.size();
+         ++i) {
+      window_dimensions[i] =
+          rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i));
+    }
+
+    return ConvGeneral(lhs, rhs, window_strides,
+                       MakePadding(base_area_dimensions, window_dimensions,
+                                   window_strides, padding),
+                       dimension_numbers);
+  });
 }
 
 XlaOp XlaBuilder::ConvGeneral(
@@ -709,7 +902,8 @@ XlaOp XlaBuilder::ConvGeneral(
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {},
+                            dimension_numbers);
 }
 
 XlaOp XlaBuilder::ConvGeneralDilated(
@@ -719,33 +913,174 @@ XlaOp XlaBuilder::ConvGeneralDilated(
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_RETURN_IF_ERROR(
+        VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers));
+
+    std::vector<int64> window_dimensions(
+        dimension_numbers.kernel_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < window_dimensions.size();
+         ++i) {
+      window_dimensions[i] =
+          rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i));
+    }
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   lhs_dilation, rhs_dilation));
+
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, instr.window(),
+                                           dimension_numbers));
+
+    *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
+
+    return AddInstruction(std::move(instr), HloOpcode::kConvolution,
+                          {lhs, rhs});
+  });
+}
+
+StatusOr<Window> XlaBuilder::MakeWindow(
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+    tensorflow::gtl::ArraySlice<int64> rhs_dilation) const {
+  const auto verify_size = [&](const size_t x, const char* x_name) {
+    if (x == 0 || x == window_dimensions.size()) {
+      return Status::OK();
+    } else {
+      return InvalidArgument(
+          "%s", tensorflow::strings::StrCat(
+                    "Window has different number of window dimensions than of ",
+                    x_name,
+                    "\nNumber of window dimensions: ", window_dimensions.size(),
+                    "\nNumber of ", x_name, ": ", x, "\n")
+                    .c_str());
+    }
+  };
+  TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides"));
+  TF_RETURN_IF_ERROR(verify_size(padding.size(), "padding entries"));
+  TF_RETURN_IF_ERROR(verify_size(lhs_dilation.size(), "lhs dilation factors"));
+  TF_RETURN_IF_ERROR(verify_size(rhs_dilation.size(), "rhs dilation factors"));
+
+  Window window;
+  for (size_t i = 0; i < window_dimensions.size(); i++) {
+    auto dim = window.add_dimensions();
+    dim->set_size(window_dimensions[i]);
+    if (!window_strides.empty()) {
+      dim->set_stride(window_strides[i]);
+    } else {
+      dim->set_stride(1);
+    }
+    if (!padding.empty()) {
+      dim->set_padding_low(padding[i].first);
+      dim->set_padding_high(padding[i].second);
+    } else {
+      dim->set_padding_low(0);
+      dim->set_padding_high(0);
+    }
+    if (!lhs_dilation.empty()) {
+      dim->set_base_dilation(lhs_dilation[i]);
+    } else {
+      dim->set_base_dilation(1);
+    }
+    if (!rhs_dilation.empty()) {
+      dim->set_window_dilation(rhs_dilation[i]);
+    } else {
+      dim->set_window_dilation(1);
+    }
+    dim->set_window_reversal(false);
+  }
+  return window;
 }
 
 XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
                       const tensorflow::gtl::ArraySlice<int64> fft_length) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferFftShape(operand_shape, fft_type, fft_length));
+
+    instr.set_fft_type(fft_type);
+    for (int64 i : fft_length) {
+      instr.add_fft_length(i);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kFft, {operand});
+  });
 }
 
 XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (!LayoutUtil::HasLayout(shape)) {
+      return InvalidArgument("Given shape to Infeed must have a layout");
+    }
+    *instr.mutable_shape() = shape;
+    instr.set_infeed_config(config);
+    return AddInstruction(std::move(instr), HloOpcode::kInfeed);
+  });
 }
 
 void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
                          const string& outfeed_config) {
-  UnimplementedOp();
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    *instr.mutable_shape() = ShapeUtil::MakeNil();
+
+    // Check and set outfeed shape.
+    if (!LayoutUtil::HasLayout(shape_with_layout)) {
+      return InvalidArgument("Given shape to Outfeed must have a layout");
+    }
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) {
+      return InvalidArgument(
+          "Outfeed shape %s must be compatible with operand shape %s",
+          ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(),
+          ShapeUtil::HumanStringWithLayout(operand_shape).c_str());
+    }
+    *instr.mutable_outfeed_shape() = shape_with_layout;
+
+    instr.set_outfeed_config(outfeed_config);
+
+    return AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand});
+  });
 }
 
 XlaOp XlaBuilder::CustomCall(const string& call_target_name,
                              tensorflow::gtl::ArraySlice<XlaOp> operands,
                              const Shape& shape) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (tensorflow::str_util::StartsWith(call_target_name, "$")) {
+      return InvalidArgument(
+          "Invalid custom_call_target \"%s\": Call targets that start with '$' "
+          "are reserved for internal use.",
+          call_target_name.c_str());
+    }
+    *instr.mutable_shape() = shape;
+    instr.set_custom_call_target(call_target_name);
+    return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
+  });
 }
 
 XlaOp XlaBuilder::HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
                               const string& channel_name,
                               int64 cost_estimate_ns, const Shape& shape) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = shape;
+    instr.set_channel_name(channel_name);
+    instr.set_cost_estimate_ns(cost_estimate_ns);
+    return AddInstruction(std::move(instr), HloOpcode::kHostCompute, operands);
+  });
 }
 
 XlaOp XlaBuilder::Complex(
@@ -754,7 +1089,9 @@ XlaOp XlaBuilder::Complex(
   return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions);
 }
 
-XlaOp XlaBuilder::Conj(const XlaOp& operand) { return UnimplementedOp(); }
+XlaOp XlaBuilder::Conj(const XlaOp& operand) {
+  return Complex(Real(operand), Neg(Imag(operand)));
+}
 
 XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs,
                       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
@@ -897,7 +1234,17 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand,
 
 XlaOp XlaBuilder::Rev(const XlaOp& operand,
                       tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferReverseShape(operand_shape, dimensions));
+    for (int64 dim : dimensions) {
+      instr.add_dimensions(dim);
+    }
+    return AddInstruction(std::move(instr), HloOpcode::kReverse, {operand});
+  });
 }
 
 XlaOp XlaBuilder::Sort(const XlaOp& operand) {
@@ -928,7 +1275,15 @@ XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
 
 XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
                                      PrimitiveType new_element_type) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert,
+                          {operand});
+  });
 }
 
 XlaOp XlaBuilder::SquareF32(const XlaOp& operand) {
@@ -954,7 +1309,28 @@ XlaOp XlaBuilder::Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
                       const XlaComputation& computation,
                       tensorflow::gtl::ArraySlice<int64> dimensions,
                       tensorflow::gtl::ArraySlice<XlaOp> static_operands) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (!static_operands.empty()) {
+      return Unimplemented("static_operands is not supported in Map");
+    }
+
+    HloInstructionProto instr;
+
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape,
+                                      dimensions));
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kMap, operands);
+  });
 }
 
 XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
@@ -1020,14 +1396,57 @@ XlaOp XlaBuilder::While(const XlaComputation& condition,
 XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices,
                          const GatherDimensionNumbers& dimension_numbers,
                          tensorflow::gtl::ArraySlice<int64> window_bounds) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input));
+    TF_ASSIGN_OR_RETURN(const Shape& gather_indices_shape,
+                        GetShape(gather_indices));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferGatherShape(input_shape, gather_indices_shape,
+                                         dimension_numbers, window_bounds));
+
+    *instr.mutable_gather_dimension_numbers() = dimension_numbers;
+    for (int64 bound : window_bounds) {
+      instr.add_gather_window_bounds(bound);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kGather,
+                          {input, gather_indices});
+  });
 }
 
 XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                               const XlaComputation& true_computation,
                               const XlaOp& false_operand,
                               const XlaComputation& false_computation) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& predicate_shape, GetShape(predicate));
+    TF_ASSIGN_OR_RETURN(const Shape& true_operand_shape,
+                        GetShape(true_operand));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& true_computation_shape,
+                        true_computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const Shape& false_operand_shape,
+                        GetShape(false_operand));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape,
+                        false_computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConditionalShape(
+            predicate_shape, true_operand_shape, false_operand_shape,
+            true_computation_shape, false_computation_shape));
+
+    // The index of true_computation must be 0 and that of false computation
+    // must be 1.
+    AddCalledComputation(true_computation, &instr);
+    AddCalledComputation(false_computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kConditional,
+                          {predicate, true_operand, false_operand});
+  });
 }
 
 XlaOp XlaBuilder::Reduce(
@@ -1059,7 +1478,12 @@ XlaOp XlaBuilder::Reduce(
 
 XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value,
                             const XlaComputation& computation) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    std::vector<int64> all_dimnos(ShapeUtil::Rank(operand_shape));
+    std::iota(all_dimnos.begin(), all_dimnos.end(), 0);
+    return Reduce(operand, init_value, computation, all_dimnos);
+  });
 }
 
 XlaOp XlaBuilder::ReduceWindow(
@@ -1067,7 +1491,21 @@ XlaOp XlaBuilder::ReduceWindow(
     const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_RETURN_IF_ERROR(
+        ValidatePaddingValues(AsInt64Slice(operand_shape.dimensions()),
+                              window_dimensions, window_strides));
+
+    std::vector<std::pair<int64, int64>> padding_values =
+        MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
+                    window_strides, padding);
+    return ReduceWindowWithGeneralPadding(operand, init_value, computation,
+                                          window_dimensions, window_strides,
+                                          padding_values);
+  });
 }
 
 XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
@@ -1076,31 +1514,111 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferReduceWindowShape(operand_shape, init_shape,
+                                               instr.window(), to_apply_shape));
+
+    AddCalledComputation(computation, &instr);
+    return AddInstruction(std::move(instr), HloOpcode::kReduceWindow,
+                          {operand, init_value});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
                                     const XlaOp& offset, float epsilon,
                                     int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferBatchNormTrainingShape(
+            operand_shape, scale_shape, offset_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormTraining,
+                          {operand, scale, offset});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale,
                                      const XlaOp& offset, const XlaOp& mean,
                                      const XlaOp& variance, float epsilon,
                                      int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
+    TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean));
+    TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBatchNormInferenceShape(
+                            operand_shape, scale_shape, offset_shape,
+                            mean_shape, variance_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormInference,
+                          {operand, scale, offset, mean, variance});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                                 const XlaOp& batch_mean, const XlaOp& batch_var,
                                 const XlaOp& grad_output, float epsilon,
                                 int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean));
+    TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var));
+    TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBatchNormGradShape(
+                            operand_shape, scale_shape, batch_mean_shape,
+                            batch_var_shape, grad_output_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormGrad,
+                          {operand, scale, batch_mean, batch_var, grad_output});
+  });
 }
 
 XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferCrossReplicaSumShape({&operand_shape}));
+
+    return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum,
+                          {operand});
+  });
 }
 
 XlaOp XlaBuilder::SelectAndScatter(
@@ -1109,7 +1627,14 @@ XlaOp XlaBuilder::SelectAndScatter(
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
     const XlaOp& source, const XlaOp& init_value,
     const XlaComputation& scatter) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    return SelectAndScatterWithGeneralPadding(
+        operand, select, window_dimensions, window_strides,
+        MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
+                    window_strides, padding),
+        source, init_value, scatter);
+  });
 }
 
 XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
@@ -1119,12 +1644,45 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     const XlaOp& source, const XlaOp& init_value,
     const XlaComputation& scatter) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& source_shape, GetShape(source));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& select_shape,
+                        select.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const ProgramShape& scatter_shape,
+                        scatter.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferSelectAndScatterShape(
+                            operand_shape, select_shape, instr.window(),
+                            source_shape, init_shape, scatter_shape));
+
+    AddCalledComputation(select, &instr);
+    AddCalledComputation(scatter, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kSelectAndScatter,
+                          {operand, source, init_value});
+  });
 }
 
 XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
                                   const int mantissa_bits) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferReducePrecisionShape(
+                            operand_shape, exponent_bits, mantissa_bits));
+    instr.set_exponent_bits(exponent_bits);
+    instr.set_mantissa_bits(mantissa_bits);
+    return AddInstruction(std::move(instr), HloOpcode::kReducePrecision,
+                          {operand});
+  });
 }
 
 void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) {
@@ -1167,15 +1725,98 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
   });
 }
 
-StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand,
-                                      int64 num_parameters) {
-  return Unimplemented("IsConstant is not implemented.");
+StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  // Verify that the handle is valid.
+  TF_RETURN_IF_ERROR(LookUpInstruction(operand).status());
+
+  bool is_constant = true;
+  std::set<int64> visited;
+  IsConstantVisitor(operand.handle(), &visited, &is_constant);
+  return is_constant;
 }
 
-StatusOr<std::unique_ptr<Literal>> XlaBuilder::ComputeConstant(
-    const XlaOp& operand, const Layout* output_layout,
-    tensorflow::gtl::ArraySlice<Literal> parameters) {
-  return Unimplemented("ComputeConstant is not implemented");
+StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
+    const XlaOp& root_op) const {
+  TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op));
+  if (!is_constant) {
+    auto op_status = LookUpInstruction(root_op);
+    string op_string =
+        op_status.ok() ? op_status.ValueOrDie()->name() : "<unknown operation>";
+    return InvalidArgument(
+        "Operand to BuildConstantSubGraph depends on a parameter.\n\n"
+        "  op requested for constant subgraph: %s\n\n"
+        "This is an internal error that typically happens when the XLA user "
+        "(e.g. TensorFlow) is attempting to determine a value that must be a "
+        "compile-time constant (e.g. an array dimension) but it is not capable "
+        "of being evaluated at XLA compile time.\n\n"
+        "Please file a usability bug with the framework being used (e.g. "
+        "TensorFlow).",
+        op_string.c_str());
+  }
+
+  TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
+                      LookUpInstruction(root_op));
+  TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(root->opcode()));
+  if (!CanBeRoot(opcode)) {
+    return InvalidArgument("the operand with opcode %s cannot be root",
+                           root->opcode().c_str());
+  }
+
+  HloComputationProto entry;
+  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
+  entry.set_name(StrCat(name_, entry.id(), "_compute_constant"));
+  entry.set_root_id(root->id());
+  ProgramShape* program_shape = entry.mutable_program_shape();
+  *program_shape->mutable_result() = root->shape();
+
+  // We use std::set to keep the instruction ids in ascending order (which is
+  // also a valid denpendency order). The related ops will be added to the
+  // subgraph in the same order.
+  std::set<int64> related_ops;
+  tensorflow::gtl::FlatSet<int64> related_calls;  // Related computations.
+  std::queue<int64> worklist;
+  worklist.push(root->id());
+  related_ops.insert(root->id());
+  while (!worklist.empty()) {
+    int64 node = worklist.front();
+    worklist.pop();
+    for (int64 id : instructions_[node].operand_ids()) {
+      if (related_ops.insert(id).second) {
+        worklist.push(id);
+      }
+    }
+    for (int64 called_id : instructions_[node].called_computation_ids()) {
+      related_calls.insert(called_id);
+    }
+  }
+
+  // Add related ops to the computation.
+  for (int64 id : related_ops) {
+    auto* instr = entry.add_instructions();
+    *instr = instructions_[id];
+    // Ensures that the instruction names are unique among the graph.
+    const string& new_name =
+        StrCat(instr->name(), ".", entry.id(), ".", instr->id());
+    instr->set_name(new_name);
+  }
+
+  XlaComputation computation(entry.id());
+  HloModuleProto* module = computation.mutable_proto();
+  module->set_name(entry.name());
+  module->set_id(entry.id());
+  module->set_entry_computation_name(entry.name());
+  module->set_entry_computation_id(entry.id());
+  *module->mutable_program_shape() = *program_shape;
+  for (auto& e : embedded_) {
+    if (related_calls.find(e.second.id()) != related_calls.end()) {
+      *module->add_computations() = e.second;
+    }
+  }
+  *module->add_computations() = std::move(entry);
+
+  return std::move(computation);
 }
 
 std::unique_ptr<XlaBuilder> XlaBuilder::CreateSubBuilder(
@@ -1186,10 +1827,6 @@ std::unique_ptr<XlaBuilder> XlaBuilder::CreateSubBuilder(
   return sub_builder;
 }
 
-Status XlaBuilder::SetReturnValue(const XlaOp& operand) {
-  return Unimplemented("SetReturnValue is not implemented.");
-}
-
 /* static */ ConvolutionDimensionNumbers
 XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
   ConvolutionDimensionNumbers dimension_numbers;
@@ -1269,10 +1906,7 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(
   instr.set_id(handle);
   instr.set_opcode(HloOpcodeString(opcode));
   if (instr.name().empty()) {
-    instr.set_name(StrCat(instr.opcode(), ".", unique_id_, ".", handle));
-  } else {
-    // Append the handle to make sure the name is unique.
-    instr.set_name(StrCat(instr.name(), ".", unique_id_, ".", handle));
+    instr.set_name(StrCat(instr.opcode()));
   }
   for (const auto& operand : operands) {
     if (operand.builder_ == nullptr) {
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 0673b86646eeecae45b1076baf0002ed94242846..1f7c731064dc004adcac56547e4717ff1638a491 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -53,6 +53,7 @@ class XlaBuilder;
 class XlaOp {
  public:
   XlaOp() : handle_(0), builder_(nullptr) {}
+  ~XlaOp() {}
 
   StatusOr<Shape> GetShape() const;
 
@@ -687,11 +688,12 @@ class XlaBuilder {
   XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
 
   // Returns true if 'operand' is a compile-time constant. A compile-time
-  // constant does not depend on parameters with index greater than or equal to
-  // `num_parameters`, or on stateful operators such as `RngNormal` or `Infeed`.
-  // Unlike `ComputeConstant`, `IsConstant` tests whether a computation is a
-  // compile-time constant without evaluating the computation.
-  StatusOr<bool> IsConstant(const XlaOp& operand, int64 num_parameters = 0);
+  // constant does not depend on any parameters, or on stateful operators such
+  // as `RngNormal` or `Infeed`.
+  //
+  // This tests whether a computation is a compile-time constant without
+  // evaluating the computation.
+  StatusOr<bool> IsConstant(const XlaOp& operand) const;
 
   // Normalizes operand across spatial and batch dimensions for each feature.
   //
@@ -731,47 +733,14 @@ class XlaBuilder {
                       const XlaOp& grad_output, float epsilon,
                       int64 feature_index);
 
-  // Computes the value of a constant indicated by a XlaOp using a non-optimized
-  // interpreter on the host.
-  //
-  // The operand must represent a constant value, which in this case
-  // means that it must not statically depend on any parameter of the
-  // computation that is being built other then the ones specified on the
-  // parameter list. The parameters in the list will be indexed by their
-  // parameter id property so the number of parameters specified should be at
-  // least as many as the largest used parameter index.
-  //
-  // `IsConstant` can be used to test whether a computation is a compile-time
-  // constant without evaluation it. `ComputeConstant` only succeeds for
-  // computations where `IsConstant` returns true.
-  //
-  // This functionality can be useful when translating a computation
-  // into XLA where something that looked dynamic is required by
-  // XLA to be specified as a constant. E.g. the source
-  // computation (outside of XLA) may include a dynamic
-  // computation of the shape of something and ComputeConstant lets
-  // you determine what the value of that computation is in the case
-  // where the value can be determined at compile time.
-  //
-  // If output_layout is non-null, then the output of the computation
-  // will be stored using that layout.
-  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
-      const XlaOp& operand, const Layout* output_layout = nullptr,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {});
-
   // Returns a new XlaBuilder whose resultant Computation is used only by this
   // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
   // behavior as the parent.
   std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
 
-  // Modifies the computation being built so that executions of it will return
-  // the value associated with operand, rather than the last expression enqueued
-  // on the XlaBuilder. Any subsequent operations added to the XlaBuilder will
-  // not have any effect unless SetReturnValue is called again.
-  Status SetReturnValue(const XlaOp& operand);
-
   // Builds the computation with the requested operations, or returns a non-ok
-  // status.
+  // status. Note that all ops that have been enqueued will be moved to the
+  // computation being returned.
   StatusOr<XlaComputation> Build();
 
   // Builds the computation with the requested operations, or notes an error in
@@ -784,6 +753,12 @@ class XlaBuilder {
   // instead.
   XlaComputation BuildAndNoteError();
 
+  // Returns a subgraph that roots on the given root. If the root is not a
+  // compile-time constant (see `IsConstant`), returns an error.
+  //
+  // This will copy the needed ops/computations to the subgraph.
+  StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
+
   // Returns the first error that was encountered while building the
   // computation. When an error is encountered, by default we return a vacuous
   // XlaOp and inform the user of the error that occurred while
@@ -796,7 +771,7 @@ class XlaBuilder {
   StatusOr<Shape> GetShape(const XlaOp& op) const;
 
   // Returns the (inferred) result for the current computation's shape.
-  StatusOr<ProgramShape> GetProgramShape();
+  StatusOr<ProgramShape> GetProgramShape() const;
 
  private:
   StatusOr<XlaOp> AddInstruction(
@@ -851,10 +826,31 @@ class XlaBuilder {
 
   // Returns the (inferred) result for the program shape for the current
   // computation and fills the root_id in the pointer.
-  StatusOr<ProgramShape> GetProgramShape(int64* root_id);
+  StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
+
+  // A visitor which checks whether an operation is a compile-time constant,
+  // meaning that it doesn't depend on any parameters, or on any stateful
+  // operation such as `RngNormal` or `Infeed`. The visitor walks the
+  // computation starting at a given operation and sets is_constant to false iff
+  // a parameter or stateful operation is encountered.
+  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+                         bool* is_constant) const;
+
+  // Checks bounds for convolution parameters.
+  Status VerifyConvolution(
+      const Shape& lhs_shape, const Shape& rhs_shape,
+      const ConvolutionDimensionNumbers& dimension_numbers) const;
+
+  // Helper function for creating a Window proto from user-supplied data.
+  // Returns error if the user-supplied data was invalid.
+  StatusOr<Window> MakeWindow(
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation) const;
 
-  string name_;      // Name to use for the built computation.
-  int64 unique_id_;  // The unique id for the built computation.
+  string name_;  // Name to use for the built computation.
 
   // The first error encountered while building the computation.
   // This is OK until the first error is encountered.
@@ -964,6 +960,37 @@ XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
   return ConstantFromArray(values);
 }
 
+// RAII-style object: sets the current sharding assignment in builder on
+// construction, and sets back to the previous assignment on destruction.
+//
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+class XlaScopedShardingAssignment {
+ public:
+  XlaScopedShardingAssignment(xla::XlaBuilder* builder,
+                              tensorflow::gtl::optional<OpSharding> sharding)
+      : builder_(builder), prev_sharding_(builder->sharding()) {
+    SetSharding(sharding);
+  }
+
+  XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete;
+  XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) =
+      delete;
+
+  ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); }
+
+ private:
+  void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
+    if (sharding.has_value()) {
+      builder_->SetSharding(sharding.value());
+    } else {
+      builder_->ClearSharding();
+    }
+  }
+
+  xla::XlaBuilder* const builder_;
+  tensorflow::gtl::optional<OpSharding> prev_sharding_;
+};
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
index 2a3c6952667a434b68ca0c5e4e9874397da173d3..7ad212aa24cd32d104cc4db7aa164c22c9f5be8f 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -30,6 +30,10 @@ namespace xla {
 class XlaComputation {
  public:
   XlaComputation() : unique_id_(-1) {}
+  XlaComputation(const HloModuleProto& proto)
+      : unique_id_(proto.id()), proto_(proto) {}
+
+  ~XlaComputation() {}
 
   XlaComputation(const XlaComputation&) = delete;
   XlaComputation& operator=(const XlaComputation&) = delete;
@@ -44,6 +48,9 @@ class XlaComputation {
 
   const HloModuleProto& proto() const { return proto_; }
 
+  // Returns true if this object is a null Computation.
+  bool IsNull() const { return unique_id_ == -1; }
+
  private:
   XlaComputation(const int64 unique_id) : unique_id_(unique_id) {}
   HloModuleProto* mutable_proto() { return &proto_; }
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index f037663e3f296861d0f9ae69175fd9f7b20b55e8..70ae95bf47398589e3c20f72c1f2084a738f253a 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -43,7 +43,7 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
 #ifdef INTEL_MKL
   flags->set_xla_cpu_use_mkl_dnn(true);
 #endif  // INTEL_MKL
-
+  flags->set_xla_gpu_max_kernel_unroll_factor(1);
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
   flags->set_xla_gpu_use_cudnn_batchnorm(false);
@@ -223,6 +223,11 @@ void AllocateFlags() {
           bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming),
           flag_values->xla_gpu_disable_multi_streaming(),
           "If true, multi-streaming in the GPU backend is disabled."),
+      tensorflow::Flag(
+          "xla_gpu_max_kernel_unroll_factor",
+          int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor),
+          flag_values->xla_gpu_max_kernel_unroll_factor(),
+          "Specify the maximum kernel unroll factor for the GPU backend."),
       tensorflow::Flag(
           "xla_dump_optimized_hlo_proto_to",
           flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 13675b7d0074592043b7e12de0aad948a3e9848f..c315b4ff30059147ee33dcdd5b0858a1c39e5999 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -97,11 +97,18 @@ Literal::Literal(const Shape& shape, bool allocate_arrays)
     const Shape& subshape = piece.subshape();
     if (ShapeUtil::IsArray(subshape)) {
       if (allocate_arrays) {
-        piece.set_buffer(new char[piece.size_bytes()]);
         if (LayoutUtil::IsSparseArray(subshape)) {
+          // For sparse arrays, the buffer must be of the size of the maximum
+          // number of sparse elements possible.
+          const int64 max_sparse_elements =
+              LayoutUtil::MaxSparseElements(subshape.layout());
+          piece.set_buffer(
+              new char[max_sparse_elements * ShapeUtil::ByteSizeOfPrimitiveType(
+                                                 subshape.element_type())]);
           piece.set_sparse_indices(new SparseIndexArray(
-              LayoutUtil::MaxSparseElements(subshape.layout()),
-              ShapeUtil::Rank(subshape)));
+              max_sparse_elements, ShapeUtil::Rank(subshape)));
+        } else {
+          piece.set_buffer(new char[piece.size_bytes()]);
         }
       } else {
         piece.set_buffer(nullptr);
@@ -1409,6 +1416,28 @@ std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
       src_literal, converter);
 }
 
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)),
+                        std::unique_ptr<Literal>>::type
+BitcastBetweenNativeTypes(const Literal& src_literal) {
+  auto converter = [](NativeSrcT src) {
+    return tensorflow::bit_cast<NativeDestT>(src);
+  };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
+      src_literal, converter);
+}
+
+// This template specialization is here to make the compiler happy. bit_cast has
+// a static check that the types are the same size. This specialization should
+// never be used because the source and destination types are checked for
+// identical sizes higher up.
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)),
+                        std::unique_ptr<Literal>>::type
+BitcastBetweenNativeTypes(const Literal& src_literal) {
+  LOG(FATAL) << "Invalid bitcast between types of different sizes.";
+}
+
 template <PrimitiveType primitive_src_type>
 std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
@@ -1428,21 +1457,33 @@ std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
 }
 
 template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
-std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal) {
+std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal,
+                                             bool bitcast) {
   CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
-  return ConvertBetweenNativeTypes<
-      typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type,
-      typename primitive_util::PrimitiveTypeToNative<
-          primitive_dest_type>::type>(src_literal);
+  if (bitcast) {
+    return BitcastBetweenNativeTypes<
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_dest_type>::type>(src_literal);
+  } else {
+    return ConvertBetweenNativeTypes<
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_dest_type>::type>(src_literal);
+  }
 }
 
 template <PrimitiveType primitive_src_type>
 StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
+    const Literal& src_literal, PrimitiveType primitive_dest_type,
+    bool bitcast) {
   switch (primitive_dest_type) {
-#define CONVERT_IF_TYPES_MATCH(type) \
-  case (type):                       \
-    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal);
+#define CONVERT_IF_TYPES_MATCH(type)                                    \
+  case (type):                                                          \
+    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal, \
+                                                           bitcast);
     CONVERT_IF_TYPES_MATCH(PRED)
     CONVERT_IF_TYPES_MATCH(S8)
     CONVERT_IF_TYPES_MATCH(S32)
@@ -1456,28 +1497,31 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
     CONVERT_IF_TYPES_MATCH(BF16)
 #undef CONVERT_IF_TYPES_MATCH
     case C64:
-      return ConvertToC64<primitive_src_type>(src_literal);
+      if (!bitcast) {
+        return ConvertToC64<primitive_src_type>(src_literal);
+      }
+      break;
     // Other types are not yet supported.
     default:
-      return Unimplemented(
-          "Converting from type %s to type %s is not implemented.",
-          PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
-          PrimitiveType_Name(primitive_dest_type).c_str());
-  }
-}
-
-}  // namespace
-
-StatusOr<std::unique_ptr<Literal>> Literal::Convert(
-    PrimitiveType primitive_dest_type) const {
-  TF_RET_CHECK(ShapeUtil::IsArray(shape()));
-  if (shape().element_type() == primitive_dest_type) {
-    return CloneToUnique();
+      break;
   }
-  switch (shape().element_type()) {
-#define CONVERT_IF_DEST_TYPE_MATCHES(type) \
-  case (type):                             \
-    return ConvertIfDestTypeMatches<(type)>(*this, primitive_dest_type);
+  return Unimplemented(
+      "Converting from type %s to type %s is not implemented.",
+      PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
+      PrimitiveType_Name(primitive_dest_type).c_str());
+}
+
+StatusOr<std::unique_ptr<Literal>> ConvertSwitch(
+    const Literal& literal, PrimitiveType primitive_dest_type, bool bitcast) {
+  TF_RET_CHECK(ShapeUtil::IsArray(literal.shape()));
+  if (literal.shape().element_type() == primitive_dest_type) {
+    return literal.CloneToUnique();
+  }
+  switch (literal.shape().element_type()) {
+#define CONVERT_IF_DEST_TYPE_MATCHES(type)                                \
+  case (type):                                                            \
+    return ConvertIfDestTypeMatches<(type)>(literal, primitive_dest_type, \
+                                            bitcast);
     CONVERT_IF_DEST_TYPE_MATCHES(PRED)
     CONVERT_IF_DEST_TYPE_MATCHES(S8)
     CONVERT_IF_DEST_TYPE_MATCHES(S32)
@@ -1493,12 +1537,35 @@ StatusOr<std::unique_ptr<Literal>> Literal::Convert(
       // Other types are not yet supported.
     default:
       return Unimplemented(
-          "Converting from type %s to type %s is not implemented.",
-          PrimitiveType_Name(shape().element_type()).c_str(),
+          "%s from type %s to type %s is not implemented.",
+          (bitcast ? "Bitcast converting" : "Converting"),
+          PrimitiveType_Name(literal.shape().element_type()).c_str(),
           PrimitiveType_Name(primitive_dest_type).c_str());
   }
 }
 
+}  // namespace
+
+StatusOr<std::unique_ptr<Literal>> Literal::Convert(
+    PrimitiveType primitive_dest_type) const {
+  return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/false);
+}
+
+StatusOr<std::unique_ptr<Literal>> Literal::BitcastConvert(
+    PrimitiveType primitive_dest_type) const {
+  if (primitive_util::BitWidth(shape().element_type()) !=
+      primitive_util::BitWidth(primitive_dest_type)) {
+    return InvalidArgument(
+        "Cannot bitcast convert from %s to %s, bit widths are different: %d != "
+        "%d",
+        PrimitiveType_Name(shape().element_type()).c_str(),
+        PrimitiveType_Name(primitive_dest_type).c_str(),
+        primitive_util::BitWidth(shape().element_type()),
+        primitive_util::BitWidth(primitive_dest_type));
+  }
+  return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/true);
+}
+
 StatusOr<std::unique_ptr<Literal>> Literal::ConvertToShape(
     const Shape& dest_shape, bool round_f32_to_bf16) const {
   if (!ShapeUtil::IsTuple(dest_shape)) {
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index a96a76fbb4e1a46e225d33b715f073c05fe6275a..8aa19222dc4b9175ec72128dfdad448f65c23e91 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -333,11 +333,19 @@ class Literal {
   template <typename NativeT>
   std::unique_ptr<Literal> Replicate(int64 times) const;
 
-  // Converts this literal to another primitive type. Returns an error if the
-  // conversion is not possible. This literal must be array-shaped.
+  // Converts this literal to another primitive type using
+  // static_cast<>. Returns an error if the conversion is not possible. This
+  // literal must be array-shaped.
   StatusOr<std::unique_ptr<Literal>> Convert(
       PrimitiveType primitive_dest_type) const;
 
+  // Converts this literal to another primitive type using a bitcast
+  // conversion. The to and from primitive types must have the same bit
+  // width. Returns an error if the conversion is not possible. This literal
+  // must be array-shaped.
+  StatusOr<std::unique_ptr<Literal>> BitcastConvert(
+      PrimitiveType primitive_dest_type) const;
+
   // Converts this literal to the given shape. Returns an error is the
   // conversion is not possible.
   //
@@ -587,6 +595,12 @@ class Literal {
   template <typename NativeT, typename FnType>
   Status Populate(const FnType& generator);
 
+  // A parallel version of Populate(). This can be used if the generator is
+  // thread-safe and the values for the shape's different elements are
+  // independent.
+  template <typename NativeT, typename FnType>
+  Status PopulateParallel(const FnType& generator);
+
   // Fills this literal with the given value.
   template <typename NativeT>
   void PopulateWithValue(NativeT value);
@@ -727,7 +741,13 @@ class Literal {
     int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); }
 
     // Returns the number of elements in this piece's array.
-    int64 element_count() const { return ShapeUtil::ElementsIn(subshape()); }
+    int64 element_count() const {
+      // If this is a sparse array, use the number of elements represented by
+      // the indices in the associated SparseIndexArray.
+      return LayoutUtil::IsSparseArray(subshape())
+                 ? sparse_indices()->index_count()
+                 : ShapeUtil::ElementsIn(subshape());
+    }
 
     // Copy the data from 'src' into this piece's buffer. Shapes of this piece
     // and src must be compatible.
@@ -785,6 +805,10 @@ class Literal {
   // buffer).
   void DeallocateBuffers();
 
+  // Implementation details shared between Populate() and PopulateParallel()
+  template <typename NativeT, typename FnType>
+  Status PopulateInternal(const FnType& generator, bool parallel);
+
   Shape shape_;
   ShapeTree<Piece> pieces_;
 
@@ -835,8 +859,7 @@ tensorflow::gtl::ArraySlice<NativeT> Literal::Piece::data() const {
       << " type, but literal element type is "
       << PrimitiveType_Name(subshape().element_type());
   return tensorflow::gtl::ArraySlice<NativeT>(
-      reinterpret_cast<const NativeT*>(buffer()),
-      ShapeUtil::ElementsIn(subshape()));
+      reinterpret_cast<const NativeT*>(buffer()), element_count());
 }
 
 template <typename NativeT>
@@ -849,7 +872,7 @@ tensorflow::gtl::MutableArraySlice<NativeT> Literal::Piece::data() {
       << " type, but literal element type is "
       << PrimitiveType_Name(subshape().element_type());
   return tensorflow::gtl::MutableArraySlice<NativeT>(
-      reinterpret_cast<NativeT*>(buffer()), ShapeUtil::ElementsIn(subshape()));
+      reinterpret_cast<NativeT*>(buffer()), element_count());
 }
 
 template <typename NativeT>
@@ -1264,19 +1287,20 @@ void Literal::PopulateSparse(SparseIndexArray indices,
   CHECK_LE(num_elements, max_elements);
   CHECK_EQ(num_elements, indices.index_count());
   auto root_data = root_piece().data<NativeT>();
-  root_data.remove_suffix(max_elements - values.size());
+  // Piece::data() returns an ArraySlice of size equal to the number of indices
+  // in the SparseIndexArray. So there is no need to adjust the size of the data
+  // here. It is enough to just copy the incoming values into the data buffer.
   std::copy(values.begin(), values.end(), root_data.begin());
   *this->root_piece().sparse_indices() = std::move(indices);
   if (sort) {
     auto root_data = this->root_piece().data<NativeT>();
-    root_data.remove_suffix(root_data.size() - num_elements);
     this->root_piece().sparse_indices()->SortWithValues(root_data);
   }
   DCHECK(this->root_piece().sparse_indices()->Validate(shape()));
 }
 
 template <typename NativeT, typename FnType>
-Status Literal::Populate(const FnType& generator) {
+Status Literal::PopulateInternal(const FnType& generator, bool parallel) {
   const Shape& this_shape = shape();
   const int64 rank = ShapeUtil::Rank(this_shape);
   TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape));
@@ -1286,11 +1310,11 @@ Status Literal::Populate(const FnType& generator) {
   if (rank > 0) {
     StrideConfig stride_config(this_shape, this_shape,
                                AsInt64Slice(this_shape.dimensions()));
-    DimensionVector minor_scan_indexes(rank, 0);
     int64 minor_dimension_size =
         ShapeUtil::GetDimension(this_shape, stride_config.minor_dimension);
 
     auto init_function = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+      DimensionVector minor_scan_indexes(rank, 0);
       const int64 index =
           IndexUtil::MultidimensionalIndexToLinearIndex(shape(), indexes);
       std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin());
@@ -1298,17 +1322,35 @@ Status Literal::Populate(const FnType& generator) {
         minor_scan_indexes[stride_config.minor_dimension] = i;
         literal_data.at(index + i) = generator(minor_scan_indexes);
       }
-      return true;
     };
-    ShapeUtil::ForEachIndex(this_shape, stride_config.base,
-                            stride_config.dimensions, stride_config.step,
-                            init_function);
+    if (parallel) {
+      ShapeUtil::ForEachIndexParallel(this_shape, stride_config.base,
+                                      stride_config.dimensions,
+                                      stride_config.step, init_function);
+    } else {
+      ShapeUtil::ForEachIndex(
+          this_shape, stride_config.base, stride_config.dimensions,
+          stride_config.step,
+          [&init_function](tensorflow::gtl::ArraySlice<int64> indexes) {
+            init_function(indexes);
+            return true;
+          });
+    }
   } else {
     // For scalars.
     literal_data.at(0) = generator({});
   }
   return Status::OK();
 }
+template <typename NativeT, typename FnType>
+Status Literal::Populate(const FnType& generator) {
+  return PopulateInternal<NativeT>(generator, /*parallel=*/false);
+}
+
+template <typename NativeT, typename FnType>
+Status Literal::PopulateParallel(const FnType& generator) {
+  return PopulateInternal<NativeT>(generator, /*parallel=*/true);
+}
 
 template <typename NativeT>
 void Literal::PopulateWithValue(NativeT value) {
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 7627762074b6132655c58690a7fffbaf2717e279..61046784e05623cd3117c24ecc6d6c474739bbd5 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -217,9 +218,7 @@ TEST_F(LiteralUtilTest, CreateSparse) {
   EXPECT_EQ(literal->sparse_indices()->data(),
             ArraySlice<int64>(expected_indices.data(),
                               expected_indices.num_elements()));
-  EXPECT_EQ(
-      ArraySlice<int64>(literal->data<int64>().data(), expected_values.size()),
-      ArraySlice<int64>(expected_values));
+  EXPECT_EQ(literal->data<int64>(), ArraySlice<int64>(expected_values));
 }
 
 TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
@@ -1090,6 +1089,48 @@ TEST_F(LiteralUtilTest, Populate) {
   }
 }
 
+TEST_F(LiteralUtilTest, PopulateParallel) {
+  struct PopulateData {
+    std::vector<int64> dimensions;
+    std::vector<int64> layout;
+  } populate_data[] = {
+      {{}, {}},
+      {{0}, {0}},
+      {{16}, {0}},
+      {{2, 0}, {1, 0}},
+      {{4, 16}, {1, 0}},
+      {{21, 12}, {0, 1}},
+      {{6, 11, 17}, {2, 0, 1}},
+      {{6, 11, 5, 17}, {3, 2, 0, 1}},
+  };
+  for (const auto& data : populate_data) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
+        data.layout);
+    auto literal = Literal::CreateFromShape(shape);
+    auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
+      // Offsets from linear index just to avoid R0 literals to be initialized
+      // with zero.
+      return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(),
+                                                           indexes) +
+             17;
+    };
+    TF_EXPECT_OK(literal->PopulateParallel<uint32>(generator));
+
+    std::vector<int64> zero_base(data.dimensions.size(), 0);
+    std::vector<int64> step(data.dimensions.size(), 1);
+    bool matched = true;
+    auto check_function = [&](ArraySlice<int64> indexes) {
+      auto value = literal->Get<uint32>(indexes);
+      matched = matched && (value == generator(indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
+                            check_function);
+    EXPECT_TRUE(matched);
+  }
+}
+
 TEST_F(LiteralUtilTest, ConvertR4) {
   // clang-format off
   auto original = Literal::CreateR4WithLayout<int8>({{
@@ -1243,6 +1284,25 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
             tensorflow::error::UNIMPLEMENTED);
 }
 
+TEST_F(LiteralUtilTest, BitcastConvert) {
+  auto original =
+      Literal::CreateR1<uint32>({tensorflow::bit_cast<uint32>(2.5f),
+                                 tensorflow::bit_cast<uint32>(-42.25f),
+                                 tensorflow::bit_cast<uint32>(100.f), 0xbeef});
+  auto expected = Literal::CreateR1<float>(
+      {2.5f, -42.25f, 100.0f, tensorflow::bit_cast<float>(0xbeef)});
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> converted,
+                          original->BitcastConvert(F32));
+}
+
+TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) {
+  auto literal = Literal::CreateR0<uint32>(1234);
+  Status status = literal->BitcastConvert(F64).status();
+  EXPECT_NE(Status::OK(), status);
+  EXPECT_TRUE(tensorflow::str_util::StrContains(status.error_message(),
+                                                "bit widths are different"));
+}
+
 TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
   LiteralProto p;
   p.mutable_shape()->set_element_type(PRED);
diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..977f8637873a4b6555798f533010a28ff36e8679
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -0,0 +1,79 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load(
+    "//tensorflow/compiler/xla:xla.bzl",
+    "xla_proto_library",
+    "xla_py_grpc_library",
+)
+
+xla_proto_library(
+    name = "xla_service_proto",
+    srcs = ["xla_service.proto"],
+    use_grpc_plugin = True,
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
+    ],
+)
+
+cc_library(
+    name = "grpc_stub",
+    srcs = ["grpc_stub.cc"],
+    hdrs = ["grpc_stub.h"],
+    deps = [
+        ":xla_service_proto",
+        "//tensorflow/compiler/xla:service_interface",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+    ],
+)
+
+tf_cc_binary(
+    name = "grpc_service_main_cpu",
+    srcs = ["grpc_service_main.cc"],
+    deps = [
+        ":grpc_service",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+tf_cc_test(
+    name = "grpc_client_test",
+    srcs = ["grpc_client_test.cc"],
+    data = [
+        "//tensorflow/compiler/xla/rpc:grpc_service_main_cpu",
+    ],
+    deps = [
+        ":grpc_stub",
+        "//tensorflow/compiler/xla/client",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+cc_library(
+    name = "grpc_service",
+    srcs = ["grpc_service.cc"],
+    hdrs = ["grpc_service.h"],
+    deps = [
+        ":xla_service_proto",
+        "//tensorflow/compiler/xla/service",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b559ee4b5a345dbb2cc481b571562a0a630b3294
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
@@ -0,0 +1,109 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Simple C++ test to exercise the GRPC capabilities of XLA.
+//
+// Launches an RPC service in a subprocess and connects to it over a socket
+// using an RPCStub.
+#include <memory>
+#include <vector>
+
+#include "grpc++/create_channel.h"
+#include "grpc++/security/credentials.h"
+
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/net.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class GRPCClientTestBase : public ::testing::Test {
+ protected:
+  GRPCClientTestBase() {
+    string test_srcdir = tensorflow::testing::TensorFlowSrcRoot();
+    string service_main_path = tensorflow::io::JoinPath(
+        test_srcdir, "compiler/xla/rpc/grpc_service_main_cpu");
+    int port = tensorflow::internal::PickUnusedPortOrDie();
+    subprocess_.SetProgram(
+        service_main_path,
+        {service_main_path, tensorflow::strings::Printf("--port=%d", port)});
+    subprocess_.SetChannelAction(tensorflow::CHAN_STDOUT,
+                                 tensorflow::ACTION_DUPPARENT);
+    subprocess_.SetChannelAction(tensorflow::CHAN_STDERR,
+                                 tensorflow::ACTION_DUPPARENT);
+    CHECK(subprocess_.Start());
+    LOG(INFO) << "Launched subprocess";
+
+    auto channel =
+        ::grpc::CreateChannel(tensorflow::strings::Printf("localhost:%d", port),
+                              ::grpc::InsecureChannelCredentials());
+    channel->WaitForConnected(gpr_time_add(
+        gpr_now(GPR_CLOCK_REALTIME), gpr_time_from_seconds(10, GPR_TIMESPAN)));
+    LOG(INFO) << "Channel to server is connected on port " << port;
+
+    xla_service_ = grpc::XlaService::NewStub(channel);
+    stub_.reset(new GRPCStub(xla_service_.get()));
+    client_.reset(new Client(stub_.get()));
+  }
+
+  ~GRPCClientTestBase() override {
+    LOG(INFO) << "Killing subprocess";
+    subprocess_.Kill(SIGKILL);
+  }
+
+  tensorflow::SubProcess subprocess_;
+  std::unique_ptr<grpc::XlaService::Stub> xla_service_;
+  std::unique_ptr<GRPCStub> stub_;
+  std::unique_ptr<Client> client_;
+};
+
+TEST_F(GRPCClientTestBase, ItsAlive) {
+  ASSERT_NE(xla_service_, nullptr);
+  ASSERT_NE(stub_, nullptr);
+  ASSERT_NE(client_, nullptr);
+}
+
+TEST_F(GRPCClientTestBase, AxpyTenValues) {
+  ComputationBuilder builder(client_.get(), "axpy_10");
+  auto alpha = builder.ConstantR0<float>(3.1415926535);
+  auto x = builder.ConstantR1<float>(
+      {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
+  auto y = builder.ConstantR1<float>(
+      {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0});
+  auto ax = builder.Mul(alpha, x);
+  auto axpy = builder.Add(ax, y);
+
+  std::vector<float> expected = {
+      1.85840735, -1.85840735, 2.28318531,   -2.28318531,  -6.42477796,
+      6.42477796, 10.56637061, -10.56637061, -14.70796327, 14.70796327};
+  std::unique_ptr<Literal> expected_literal =
+      Literal::CreateR1<float>(expected);
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(auto result_literal, client_->ExecuteAndTransfer(
+                                                   computation, {}, nullptr));
+  LiteralTestUtil::ExpectNear(*expected_literal, *result_literal,
+                              ErrorSpec(0.0001));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
new file mode 100644
index 0000000000000000000000000000000000000000..414829d6e76354672c7c1998d1fb1bd185043d78
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -0,0 +1,192 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/rpc/grpc_service.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
+    perftools::gputools::Platform* platform) {
+  std::unique_ptr<GRPCService> grpc_service(new GRPCService());
+  TF_ASSIGN_OR_RETURN(grpc_service->service_,
+                      ::xla::Service::NewService(platform));
+  return std::move(grpc_service);
+}
+
+::grpc::Status DelegateRPC(std::function<tensorflow::Status()> op) {
+  tensorflow::Status s = op();
+  return tensorflow::ToGrpcStatus(s);
+}
+
+::grpc::Status GRPCService::Computation(::grpc::ServerContext* context,
+                                        const ComputationRequest* arg,
+                                        ComputationResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Computation(arg, result); });
+}
+
+::grpc::Status GRPCService::CreateOp(::grpc::ServerContext* context,
+                                     const OpRequest* arg, OpResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Op(arg, result); });
+}
+
+::grpc::Status GRPCService::Unregister(::grpc::ServerContext* context,
+                                       const UnregisterRequest* arg,
+                                       UnregisterResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Unregister(arg, result); });
+}
+
+::grpc::Status GRPCService::DeconstructTuple(::grpc::ServerContext* context,
+                                             const DeconstructTupleRequest* arg,
+                                             DeconstructTupleResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->DeconstructTuple(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::SetReturnValue(::grpc::ServerContext* context,
+                                           const SetReturnValueRequest* arg,
+                                           SetReturnValueResponse* results) {
+  return DelegateRPC([this, arg, results]() {
+    return service_->SetReturnValue(arg, results);
+  });
+}
+
+::grpc::Status GRPCService::Execute(::grpc::ServerContext* context,
+                                    const ExecuteRequest* arg,
+                                    ExecuteResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Execute(arg, result); });
+}
+
+::grpc::Status GRPCService::ExecuteAsync(::grpc::ServerContext* context,
+                                         const ExecuteAsyncRequest* arg,
+                                         ExecuteAsyncResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ExecuteAsync(arg, result); });
+}
+
+::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
+                                             const WaitForExecutionRequest* arg,
+                                             WaitForExecutionResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->WaitForExecution(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToClient(::grpc::ServerContext* context,
+                                             const TransferToClientRequest* arg,
+                                             TransferToClientResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToClient(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToServer(::grpc::ServerContext* context,
+                                             const TransferToServerRequest* arg,
+                                             TransferToServerResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToServer(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToInfeed(::grpc::ServerContext* context,
+                                             const TransferToInfeedRequest* arg,
+                                             TransferToInfeedResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToInfeed(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferFromOutfeed(
+    ::grpc::ServerContext* context, const TransferFromOutfeedRequest* arg,
+    TransferFromOutfeedResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferFromOutfeed(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::ResetDevice(::grpc::ServerContext* context,
+                                        const ResetDeviceRequest* arg,
+                                        ResetDeviceResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ResetDevice(arg, result); });
+}
+
+::grpc::Status GRPCService::IsConstant(::grpc::ServerContext* context,
+                                       const IsConstantRequest* arg,
+                                       IsConstantResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->IsConstant(arg, result); });
+}
+
+::grpc::Status GRPCService::ComputeConstant(::grpc::ServerContext* context,
+                                            const ComputeConstantRequest* arg,
+                                            ComputeConstantResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ComputeConstant(arg, result); });
+}
+
+::grpc::Status GRPCService::GetShape(::grpc::ServerContext* context,
+                                     const GetShapeRequest* arg,
+                                     GetShapeResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->GetShape(arg, result); });
+}
+
+::grpc::Status GRPCService::GetComputationShape(
+    ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
+    GetComputationShapeResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetComputationShape(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::GetLocalShape(::grpc::ServerContext* context,
+                                          const GetLocalShapeRequest* arg,
+                                          GetLocalShapeResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->GetLocalShape(arg, result); });
+}
+
+::grpc::Status GRPCService::GetComputationStats(
+    ::grpc::ServerContext* context, const ComputationStatsRequest* arg,
+    ComputationStatsResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetComputationStats(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::SnapshotComputation(
+    ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
+    SnapshotComputationResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->SnapshotComputation(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::LoadComputationSnapshot(
+    ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
+    LoadComputationSnapshotResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->LoadComputationSnapshot(arg, result);
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c9e484517e9ced45c40dda78a2bd427a24c2722
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
+#define TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
+
+#include "grpc++/server_context.h"
+#include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
+#include "tensorflow/compiler/xla/service/service.h"
+
+namespace xla {
+
+// Service implementation which wraps a XLA Service with a GRPC interface.
+class GRPCService : public grpc::XlaService::Service {
+ public:
+  // Factory for creating a RPCService. The parameter platform is the platform
+  // that the service should target. If platform is null then the default
+  // platform is used.
+  static StatusOr<std::unique_ptr<GRPCService>> NewService(
+      perftools::gputools::Platform* platform = nullptr);
+
+  ::grpc::Status Computation(::grpc::ServerContext* context,
+                             const ComputationRequest* arg,
+                             ComputationResponse* result) override;
+
+  ::grpc::Status CreateOp(::grpc::ServerContext* context, const OpRequest* arg,
+                          OpResponse* result) override;
+
+  ::grpc::Status Unregister(::grpc::ServerContext* context,
+                            const UnregisterRequest* arg,
+                            UnregisterResponse* result) override;
+
+  ::grpc::Status DeconstructTuple(::grpc::ServerContext* context,
+                                  const DeconstructTupleRequest* arg,
+                                  DeconstructTupleResponse* result) override;
+
+  ::grpc::Status SetReturnValue(::grpc::ServerContext* context,
+                                const SetReturnValueRequest* arg,
+                                SetReturnValueResponse* results) override;
+
+  ::grpc::Status Execute(::grpc::ServerContext* context,
+                         const ExecuteRequest* arg,
+                         ExecuteResponse* result) override;
+
+  ::grpc::Status ExecuteAsync(::grpc::ServerContext* context,
+                              const ExecuteAsyncRequest* arg,
+                              ExecuteAsyncResponse* result) override;
+
+  ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
+                                  const WaitForExecutionRequest* arg,
+                                  WaitForExecutionResponse* result) override;
+
+  ::grpc::Status TransferToClient(::grpc::ServerContext* context,
+                                  const TransferToClientRequest* arg,
+                                  TransferToClientResponse* result) override;
+
+  ::grpc::Status TransferToServer(::grpc::ServerContext* context,
+                                  const TransferToServerRequest* arg,
+                                  TransferToServerResponse* result) override;
+
+  ::grpc::Status TransferToInfeed(::grpc::ServerContext* context,
+                                  const TransferToInfeedRequest* arg,
+                                  TransferToInfeedResponse* result) override;
+
+  ::grpc::Status TransferFromOutfeed(
+      ::grpc::ServerContext* context, const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
+  ::grpc::Status ResetDevice(::grpc::ServerContext* context,
+                             const ResetDeviceRequest* arg,
+                             ResetDeviceResponse* result) override;
+
+  ::grpc::Status IsConstant(::grpc::ServerContext* context,
+                            const IsConstantRequest* arg,
+                            IsConstantResponse* result) override;
+
+  ::grpc::Status ComputeConstant(::grpc::ServerContext* context,
+                                 const ComputeConstantRequest* arg,
+                                 ComputeConstantResponse* result) override;
+
+  ::grpc::Status GetShape(::grpc::ServerContext* context,
+                          const GetShapeRequest* arg,
+                          GetShapeResponse* result) override;
+
+  ::grpc::Status GetComputationShape(
+      ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
+      GetComputationShapeResponse* result) override;
+
+  ::grpc::Status GetLocalShape(::grpc::ServerContext* context,
+                               const GetLocalShapeRequest* arg,
+                               GetLocalShapeResponse* result) override;
+
+  ::grpc::Status GetComputationStats(::grpc::ServerContext* context,
+                                     const ComputationStatsRequest* arg,
+                                     ComputationStatsResponse* result) override;
+
+  ::grpc::Status SnapshotComputation(
+      ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
+      SnapshotComputationResponse* result) override;
+
+  ::grpc::Status LoadComputationSnapshot(
+      ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
+      LoadComputationSnapshotResponse* result) override;
+
+ private:
+  std::unique_ptr<::xla::Service> service_;
+
+  GRPCService() {}
+  GRPCService(const GRPCService&) = delete;
+  void operator=(const GRPCService&) = delete;
+};
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
diff --git a/tensorflow/compiler/xla/rpc/grpc_service_main.cc b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e29908ccec80db76e3b5b856e57382c56430c379
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic server binary that exposes a xla::Service through a GRPC interface
+// on a configurable port.
+#include "grpc++/security/server_credentials.h"
+#include "grpc++/server.h"
+#include "grpc++/server_builder.h"
+#include "tensorflow/compiler/xla/rpc/grpc_service.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace xla {
+namespace {
+
+int RealMain(int argc, char** argv) {
+  int32 port = 1685;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("port", &port, "port to listen on"),
+  };
+  string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parsed_values_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parsed_values_ok) {
+    LOG(ERROR) << usage;
+    return 2;
+  }
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  std::unique_ptr<xla::GRPCService> service =
+      xla::GRPCService::NewService().ConsumeValueOrDie();
+
+  ::grpc::ServerBuilder builder;
+  string server_address(tensorflow::strings::Printf("localhost:%d", port));
+
+  builder.AddListeningPort(server_address, ::grpc::InsecureServerCredentials());
+  builder.RegisterService(service.get());
+  std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
+
+  LOG(INFO) << "Server listening on " << server_address;
+  server->Wait();
+
+  return 0;
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) { return xla::RealMain(argc, argv); }
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e1f2b0abe39b10dd82b700941748bc4f4e8cb2f8
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc
@@ -0,0 +1,244 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace xla {
+
+GRPCStub::~GRPCStub() = default;
+
+tensorflow::Status MakeRPC(
+    const std::function<::grpc::Status(::grpc::ClientContext*)>& rpc_method) {
+  ::grpc::ClientContext context;
+  ::grpc::Status s = rpc_method(&context);
+  return tensorflow::FromGrpcStatus(s);
+}
+
+tensorflow::Status GRPCStub::TransferToClient(
+    const TransferToClientRequest* request,
+    TransferToClientResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToClient(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferToServer(
+    const TransferToServerRequest* request,
+    TransferToServerResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToServer(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferToInfeed(
+    const TransferToInfeedRequest* request,
+    TransferToInfeedResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToInfeed(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferFromOutfeed(
+    const TransferFromOutfeedRequest* request,
+    TransferFromOutfeedResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferFromOutfeed(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
+                                         ResetDeviceResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ResetDevice(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::LoadComputationSnapshot(
+    const LoadComputationSnapshotRequest* request,
+    LoadComputationSnapshotResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->LoadComputationSnapshot(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::Execute(const ExecuteRequest* request,
+                                     ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Execute(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
+                                          ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteGraph(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteParallel(
+    const ExecuteParallelRequest* request, ExecuteParallelResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteParallel(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteGraphParallel(
+    const ExecuteGraphParallelRequest* request,
+    ExecuteParallelResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteGraphParallel(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request,
+                                          ExecuteAsyncResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteAsync(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::WaitForExecution(
+    const WaitForExecutionRequest* request,
+    WaitForExecutionResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->WaitForExecution(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::DeconstructTuple(
+    const DeconstructTupleRequest* request,
+    DeconstructTupleResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->DeconstructTuple(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationStats(
+    const ComputationStatsRequest* request,
+    ComputationStatsResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationStats(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationGraphStats(
+    const ComputationGraphStatsRequest* request,
+    ComputationStatsResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationGraphStats(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationShape(
+    const GetComputationShapeRequest* request,
+    GetComputationShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetShape(const GetShapeRequest* request,
+                                      GetShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetDeviceHandles(
+    const GetDeviceHandlesRequest* request,
+    GetDeviceHandlesResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetDeviceHandles(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::CreateChannelHandle(
+    const CreateChannelHandleRequest* request,
+    CreateChannelHandleResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->CreateChannelHandle(context, *request, response);
+  });
+}
+
+// Methods used by ComputationBuilder.
+tensorflow::Status GRPCStub::Computation(const ComputationRequest* request,
+                                         ComputationResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Computation(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::Op(const OpRequest* request,
+                                OpResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->CreateOp(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request,
+                                           GetLocalShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetLocalShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::SetReturnValue(
+    const SetReturnValueRequest* request, SetReturnValueResponse* responses) {
+  return MakeRPC([this, request, responses](::grpc::ClientContext* context) {
+    return grpc_stub_->SetReturnValue(context, *request, responses);
+  });
+}
+
+tensorflow::Status GRPCStub::IsConstant(const IsConstantRequest* request,
+                                        IsConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->IsConstant(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ComputeConstant(
+    const ComputeConstantRequest* request, ComputeConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ComputeConstant(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ComputeConstantGraph(
+    const ComputeConstantGraphRequest* request,
+    ComputeConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ComputeConstantGraph(context, *request, response);
+  });
+}
+
+// Methods used by Computation.
+tensorflow::Status GRPCStub::SnapshotComputation(
+    const SnapshotComputationRequest* request,
+    SnapshotComputationResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->SnapshotComputation(context, *request, response);
+  });
+}
+
+// Methods used by GlobalData.
+tensorflow::Status GRPCStub::Unregister(const UnregisterRequest* request,
+                                        UnregisterResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Unregister(context, *request, response);
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd9810d4f1a5e084b73e83007ea7f9f8b0462c72
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.h
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
+#define TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
+
+#include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
+#include "tensorflow/compiler/xla/service_interface.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+class GRPCStub : public ServiceInterface {
+ public:
+  explicit GRPCStub(grpc::XlaService::Stub* stub) : grpc_stub_(stub) {}
+  ~GRPCStub() override;
+
+  tensorflow::Status TransferToClient(
+      const TransferToClientRequest* arg,
+      TransferToClientResponse* result) override;
+
+  tensorflow::Status TransferToServer(
+      const TransferToServerRequest* arg,
+      TransferToServerResponse* result) override;
+
+  tensorflow::Status TransferToInfeed(
+      const TransferToInfeedRequest* arg,
+      TransferToInfeedResponse* result) override;
+
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
+  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
+                                 ResetDeviceResponse* result) override;
+
+  tensorflow::Status LoadComputationSnapshot(
+      const LoadComputationSnapshotRequest* request,
+      LoadComputationSnapshotResponse* result) override;
+
+  tensorflow::Status Execute(const ExecuteRequest* arg,
+                             ExecuteResponse* result) override;
+
+  tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* request,
+                                  ExecuteResponse* response) override;
+
+  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                     ExecuteParallelResponse* result) override;
+
+  tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* request,
+      ExecuteParallelResponse* response) override;
+
+  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                                  ExecuteAsyncResponse* result) override;
+
+  tensorflow::Status WaitForExecution(
+      const WaitForExecutionRequest* arg,
+      WaitForExecutionResponse* result) override;
+
+  tensorflow::Status DeconstructTuple(
+      const DeconstructTupleRequest* arg,
+      DeconstructTupleResponse* result) override;
+
+  tensorflow::Status GetComputationStats(
+      const ComputationStatsRequest* arg,
+      ComputationStatsResponse* result) override;
+
+  tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* request,
+      ComputationStatsResponse* response) override;
+
+  tensorflow::Status GetComputationShape(
+      const GetComputationShapeRequest* arg,
+      GetComputationShapeResponse* result) override;
+
+  tensorflow::Status GetShape(const GetShapeRequest* arg,
+                              GetShapeResponse* result) override;
+
+  tensorflow::Status GetDeviceHandles(
+      const GetDeviceHandlesRequest* arg,
+      GetDeviceHandlesResponse* result) override;
+
+  tensorflow::Status CreateChannelHandle(
+      const CreateChannelHandleRequest* arg,
+      CreateChannelHandleResponse* result) override;
+
+  // Methods used by ComputationBuilder.
+  tensorflow::Status Computation(const ComputationRequest* arg,
+                                 ComputationResponse* result) override;
+
+  tensorflow::Status Op(const OpRequest* arg, OpResponse* result) override;
+  tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg,
+                                   GetLocalShapeResponse* result) override;
+
+  tensorflow::Status SetReturnValue(const SetReturnValueRequest* arg,
+                                    SetReturnValueResponse* results) override;
+
+  tensorflow::Status IsConstant(const IsConstantRequest* arg,
+                                IsConstantResponse* result) override;
+
+  tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
+                                     ComputeConstantResponse* result) override;
+
+  tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) override;
+
+  // Methods used by Computation.
+  tensorflow::Status SnapshotComputation(
+      const SnapshotComputationRequest* ag,
+      SnapshotComputationResponse* result) override;
+
+  // Methods used by GlobalData.
+  tensorflow::Status Unregister(const UnregisterRequest* arg,
+                                UnregisterResponse* result) override;
+
+  grpc::XlaService::Stub* service() { return grpc_stub_; }
+
+ private:
+  grpc::XlaService::Stub* grpc_stub_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GRPCStub);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c47164ee1b7657ae378a053f553442bee751753e
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -0,0 +1,225 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA service API.
+//
+// Users 1) build up computations and 2) create allocations via this API.
+// Computations are composed of data flowing between arbitrarily-sized
+// vector-oriented operations.
+//
+// Users build up computations using a ComputationHandle, and talk about
+// allocations using GlobalDataHandles.
+//
+// There are currently no checkpointing capabilities or distribution/replication
+// guarantees. The service runs on a single machine (e.g. one task) and that is
+// its failure domain.
+//
+// Canonical example of "alpha * X + Y":
+// * Make a computation.
+// * Add alpha and X and Y as parameters.
+// * Request the multiplication of alpha and X.
+// * Request the addition of that result and Y.
+//
+// Then, pass the computation and appropriately shaped inputs to the XLA
+// service's Execute method, which provides a result as a GlobalDataHandle.
+//
+// All data in XLA computations are conceptually immutable.
+//
+// Note: this API is subject to change / refinement over time -- use the
+// provided client libraries to insulate code from changes to this service API.
+
+syntax = "proto3";
+
+import "tensorflow/compiler/xla/xla.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
+package xla;
+
+service XlaService {
+  /////////////////////////
+  // Global data requests
+
+  // Unregisters a global allocation.
+  //
+  // If the handle given is not currently allocated, a NOT_FOUND status is
+  // returned.
+  rpc Unregister(UnregisterRequest) returns (UnregisterResponse) {
+  }
+
+  // Deconstructs a tuple. Returns a newly created GlobalDataHandle for each
+  // element in the tuple.
+  rpc DeconstructTuple(DeconstructTupleRequest)
+      returns (DeconstructTupleResponse) {
+  }
+
+  // Unpack requests that a global data handle, with a tuple shape, has global
+  // data handles created for each of its constituent members. This is the
+  // equivalent of the "destructuring assignment" present in various programming
+  // languages.
+  rpc Unpack(UnpackRequest) returns (UnpackResponse) {
+  }
+
+  // Requests the shape of the referenced global data.
+  rpc GetShape(GetShapeRequest) returns (GetShapeResponse) {
+  }
+
+  // Requests the program shape of the referenced computation.
+  rpc GetComputationShape(GetComputationShapeRequest)
+      returns (GetComputationShapeResponse) {
+  }
+
+  // Requests the statistics of the given computation.
+  rpc GetComputationStats(ComputationStatsRequest)
+      returns (ComputationStatsResponse) {
+  }
+
+  // Requests the statistics of the given computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  rpc GetComputationGraphStats(ComputationGraphStatsRequest)
+      returns (ComputationStatsResponse) {
+  }
+
+  // Loads a variable number of values with a given element type from ColumnIO.
+  rpc LoadData(LoadDataRequest) returns (LoadDataResponse) {
+  }
+
+  // Transfers the given global data to the client in the form of a Literal.
+  rpc TransferToClient(TransferToClientRequest)
+      returns (TransferToClientResponse) {
+  }
+
+  // Transfers the given literal to the server to be stored in a global
+  // allocation, which is returned.
+  rpc TransferToServer(TransferToServerRequest)
+      returns (TransferToServerResponse) {
+  }
+
+  // Transfers the given literal to the Infeed buffer of the device.
+  rpc TransferToInfeed(TransferToInfeedRequest)
+      returns (TransferToInfeedResponse) {
+  }
+
+  // Transferred literal from the Outfeed buffer of the device.
+  rpc TransferFromOutfeed(TransferFromOutfeedRequest)
+      returns (TransferFromOutfeedResponse) {
+  }
+
+  // Resets the device, clearing all existing state on the device.
+  rpc ResetDevice(ResetDeviceRequest) returns (ResetDeviceResponse) {
+  }
+
+  // Tests if an expression is a compile-time constant.
+  rpc IsConstant(IsConstantRequest) returns (IsConstantResponse) {
+  }
+
+  // Computes the value of a constant expression.
+  rpc ComputeConstant(ComputeConstantRequest)
+      returns (ComputeConstantResponse) {
+  }
+
+  // Computes the value of a constant expression. The request contains the
+  // computation graph for the constant expression.
+  rpc ComputeConstantGraph(ComputeConstantGraphRequest)
+      returns (ComputeConstantResponse) {
+  }
+
+  // Retrieves the inferred shape for a value within a computation.
+  rpc GetLocalShape(GetLocalShapeRequest) returns (GetLocalShapeResponse) {
+  }
+
+  // Requests one or more device handles from the target. The returned device
+  // handles can be used to specify the device on which to execute computations
+  // or transfer data.
+  rpc GetDeviceHandles(GetDeviceHandlesRequest)
+      returns (GetDeviceHandlesResponse) {
+  }
+
+  // Creates a channel handle that can be used to transfer data between
+  // two computations via a pair of Send and Recv instructions.
+  rpc CreateChannelHandle(CreateChannelHandleRequest)
+      returns (CreateChannelHandleResponse) {
+  }
+
+  // Requests that the referenced computation be specialized for the provided
+  // arguments for subsequent execution. This permits things such as value
+  // specialization.
+  rpc Specialize(SpecializeRequest) returns (SpecializeResponse) {
+  }
+
+  // Modifies the provided computation so that subsequent executions
+  // will compute the provided ComputationDataHandle, rather than the
+  // last expression enqueued on that Computation.
+  rpc SetReturnValue(SetReturnValueRequest) returns (SetReturnValueResponse) {
+  }
+
+  // Computation creates a new computation with the given name.
+  // A unique ComputationHandle is returned.
+  rpc Computation(ComputationRequest) returns (ComputationResponse) {
+  }
+
+  // Adds a new op to a computation.
+  rpc CreateOp(OpRequest) returns (OpResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. Returns global data output and execution timing.
+  rpc Execute(ExecuteRequest) returns (ExecuteResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. The request contains the whole computation graph.
+  // Returns global data output and execution timing.
+  rpc ExecuteGraph(ExecuteGraphRequest) returns (ExecuteResponse) {
+  }
+
+  // Invokes the provided list of computations in parallel with the provided
+  // global data for each computation. Returns a list of global data output and
+  // execution timing.
+  rpc ExecuteParallel(ExecuteParallelRequest)
+      returns (ExecuteParallelResponse) {
+  }
+
+  // Invokes the provided list of computations in parallel with the provided
+  // global data for each computation. Returns a list of global data output and
+  // execution timing.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  rpc ExecuteGraphParallel(ExecuteGraphParallelRequest)
+      returns (ExecuteParallelResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. Returns a handle to the execution.
+  rpc ExecuteAsync(ExecuteAsyncRequest) returns (ExecuteAsyncResponse) {
+  }
+
+  // Waits until the given execution (aysnchronously launched) is complete, and
+  // returns the global data output.
+  rpc WaitForExecution(WaitForExecutionRequest)
+      returns (WaitForExecutionResponse) {
+  }
+
+  // Serializes a computation to proto form, so it can be loaded via
+  // LoadComputationSnapshot.
+  rpc SnapshotComputation(SnapshotComputationRequest)
+      returns (SnapshotComputationResponse) {
+  }
+
+  // Loads a computation from a captured snapshot.
+  rpc LoadComputationSnapshot(LoadComputationSnapshotRequest)
+      returns (LoadComputationSnapshotResponse) {
+  }
+}
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index db91e804072676c609d4d1fa3110bd587f5f2bc0..ddc099807d36ecfccfa81f6718776d0cab60d406 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -302,6 +302,29 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "pattern_matcher",
+    hdrs = ["pattern_matcher.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "pattern_matcher_test",
+    srcs = ["pattern_matcher_test.cc"],
+    deps = [
+        ":hlo",
+        ":pattern_matcher",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_reachability",
     srcs = ["hlo_reachability.cc"],
@@ -2535,6 +2558,7 @@ cc_library(
     srcs = ["hlo_runner.cc"],
     hdrs = ["hlo_runner.h"],
     deps = [
+        ":computation_placer",
         ":executable",
         ":hlo",
         ":transfer_manager",
@@ -2551,6 +2575,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 0e4624fd69e623efca780937c5347dbf6bb9afe1..6cb1bd56695772a38c377280da4ea357027519e5 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1424,6 +1424,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   return Status::OK();
 }
 
+// TODO(b/74536353): do this simplification for BroadcastDimOne as well.
 StatusOr<bool> AlgebraicSimplifierVisitor::
     TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
         HloInstruction* reshape_or_broadcast) {
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index 1e439cde11cf74272101b80c867a308e51ab26a6..54af40506dab48b3c2a3a44eb0b5f5fb213a32ec 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -29,7 +29,8 @@ ParallelLoopEmitter::ParallelLoopEmitter(
     : LoopEmitter(target_element_generator, target_array, ir_builder),
       dynamic_loop_bounds_(dynamic_loop_bounds) {}
 
-llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<llvm_ir::IrArray::Index>
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   CHECK(!ShapeUtil::IsTuple(shape_));
   CHECK(!ShapeUtil::IsScalar(shape_));
@@ -69,7 +70,7 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK(exit_bb_ != nullptr);
 
-  return array_index;
+  return {array_index};
 }
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
index ce92e36a944de33b991d97460f0b2e859ad56081..755715634aa70a822b21d25dcae20a8fe053477a 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -60,7 +60,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
 
-  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name) override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 56723e765048698baedc50ae7b189d0287ee56b8..3f7089d6ca1e1a3b9bb42028327ba54ba4b93974 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -199,6 +199,7 @@ class DfsHloVisitorBase {
   virtual Status HandleReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleBitcast(HloInstructionPtr hlo) = 0;
   virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0;
+  virtual Status HandleBroadcastDimOne(HloInstructionPtr hlo) = 0;
   virtual Status HandleReshape(HloInstructionPtr hlo) = 0;
   virtual Status HandleTranspose(HloInstructionPtr hlo) = 0;
   virtual Status HandleParameter(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 240faebe62f5cee4f61b3c36b5e8f653cfd6db8e..e6680ee9b87e1a01782204047c3b2104995c11ed 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -158,6 +158,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleBroadcast(HloInstructionPtr broadcast) override {
     return DefaultAction(broadcast);
   }
+  Status HandleBroadcastDimOne(HloInstructionPtr broadcastDimOne) override {
+    return DefaultAction(broadcastDimOne);
+  }
   Status HandlePad(HloInstructionPtr pad) override {
     return DefaultAction(pad);
   }
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 1792893ae401bf16d2dd9e861607e8f3821a505e..1eccfe8571ceb5b082f2b47473a38d7405d790b7 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -94,11 +94,17 @@ se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
 // Determines whether we can safely perform a winograd non-fused convolution for
 // the given input and output shapes.  This works around b/68264959, an integer
 // overflow in cuDNNv5 and cuDNNv6.
-//
-// TODO(jlebar): We shouldn't need this check for cuDNNv7.
-bool ShouldIncludeWinogradNonfusedAlgo(
-    const Shape& input_shape, const Shape& output_shape,
-    const ConvolutionDimensionNumbers& dnums) {
+bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
+                                       const Shape& output_shape,
+                                       const ConvolutionDimensionNumbers& dnums,
+                                       se::StreamExecutor* stream_exec) {
+  // Skip this check for cudnn7 and newer.
+  auto version =
+      stream_exec->AsDnn()->GetVersion();
+  if (version.ok() && version.ValueOrDie().major_version() >= 7) {
+    return true;
+  }
+
   int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
   int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
   int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
@@ -118,20 +124,20 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                          bool with_winograd_nonfused,
-                                         se::StreamExecutor* stream_exec_) {
+                                         se::StreamExecutor* stream_exec) {
   std::vector<AlgorithmDesc> algorithms;
   switch (kind) {
     case CudnnConvKind::kBackwardFilter:
-      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kBackwardInput:
-      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kForward:
-      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
-                                                &algorithms));
+      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
+                                               &algorithms));
       break;
   }
 
@@ -209,8 +215,8 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     return nullopt;
   }
 
-  const bool use_winograd_nonfused =
-      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
+  const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
+      input_shape, output_shape, dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
   int64 best_result_bytes_used = 0;
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 32413f975a40c1abc334b16e81097bb44f56a44a..532d436ee82b985a4efe300f90223e1298e85765 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -160,14 +160,19 @@ static HloInstruction* CreateCudnnConv(
   Shape call_shape =
       ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})});
 
-  // Our CustomCall takes three arguments: The conv lhs and rhs, and the cudnn
-  // algorithm to use.  It's up to a later pass to choose the algorithm, so to
-  // indicate that we haven't yet made a choice, we speicfy -1 for that arg.
+  // Our CustomCall takes four arguments: The conv lhs and rhs, the cudnn
+  // algorithm to use, and a boolean indicating whether to use tensor cores.
+  //
+  // It's up to a later pass to choose the algorithm and decide whether to use
+  // tensor cores, so to indicate that we haven't yet made a choice, we speicfy
+  // -1 and false for those args.
   HloInstruction* negative_one = computation->AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<int64>(-1)));
+  HloInstruction* false_constant = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloInstruction* custom_call =
       computation->AddInstruction(HloInstruction::CreateCustomCall(
-          call_shape, {lhs, rhs, negative_one}, call_target));
+          call_shape, {lhs, rhs, negative_one, false_constant}, call_target));
   custom_call->set_window(window);
   custom_call->set_convolution_dimension_numbers(dnums);
   return custom_call;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index d29cc21ab1c697f8481ed1e94846d4df5ec5c1dc..26e497762f2a6f23767c5b98f339eefdef0b7468 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -536,7 +536,27 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     thunk_sequence_->emplace_back(BuildGemmThunk(fusion));
     return Status::OK();
   }
-  thunk_sequence_->emplace_back(BuildKernelThunk(fusion));
+
+  int max_unroll_factor = fusion->GetModule()
+                              ->config()
+                              .debug_options()
+                              .xla_gpu_max_kernel_unroll_factor();
+
+  // Find the largest possible power of two to unroll by.
+  // TODO(kramerb): Make this smarter.
+  int unroll_factor = 1;
+  if (!fusion->IsMultiOutputFusion()) {
+    CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop);
+    int64 num_elements = ShapeUtil::ElementsIn(fusion->shape());
+    for (int i = max_unroll_factor; i > 1; i /= 2) {
+      if (num_elements % i == 0) {
+        unroll_factor = i;
+        break;
+      }
+    }
+  }
+
+  thunk_sequence_->emplace_back(BuildKernelThunk(fusion, unroll_factor));
   return IrEmitter::HandleFusion(fusion);
 }
 
@@ -2021,7 +2041,7 @@ Status IrEmitterUnnested::HandleGather(HloInstruction* gather) {
 }
 
 std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
-    const HloInstruction* inst) {
+    const HloInstruction* inst, int unroll_factor) {
   const BufferAssignment& buffer_assn =
       ir_emitter_context_->buffer_assignment();
 
@@ -2113,7 +2133,7 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
   }
 
   return MakeUnique<KernelThunk>(buffers, llvm_ir::AsString(kernel->getName()),
-                                 inst);
+                                 inst, unroll_factor);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
@@ -2485,21 +2505,28 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
+  int unroll_factor = thunk->unroll_factor();
   VLOG(3) << bindings_.ToString();
 
   const Shape& element_shape = hlo.IsMultiOutputFusion()
                                    ? ShapeUtil::GetSubshape(hlo.shape(), {0})
                                    : hlo.shape();
+  VLOG(3) << "EmitTargetElementLoopInThunk "
+          << ShapeUtil::HumanStringWithLayout(hlo.shape())
+          << " for unroll_factor " << unroll_factor;
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      element_shape, ir_emitter_context_->device_description());
+      element_shape, ir_emitter_context_->device_description(), unroll_factor);
   UpdateLaunchDimensions(launch_dimensions, thunk,
                          ir_emitter_context_->llvm_module());
   if (!hlo.IsMultiOutputFusion()) {
     return ParallelLoopEmitter(element_generator, GetIrArray(hlo, hlo),
-                               launch_dimensions, &ir_builder_)
+                               launch_dimensions, &ir_builder_, unroll_factor)
         .EmitLoop(IrName(&hlo));
   }
 
+  CHECK_EQ(unroll_factor, 1)
+      << "multi-output fusion does not support unrolling";
+
   // For multiple outputs fusion, we need to emit each operand and the root.
   std::vector<llvm_ir::IrArray> output_arrays;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 66c62e2d2de3ed1668271a21943dc73ed3d77651..b842f480c6257c1a8bee8cdac55e29c5db6801a0 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -150,8 +150,10 @@ class IrEmitterUnnested : public IrEmitter {
 
   // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
   // caller needs to make sure `inst` outlives the lifetime of the returned
-  // Thunk object.
-  std::unique_ptr<KernelThunk> BuildKernelThunk(const HloInstruction* inst);
+  // Thunk object. The kernel implementation will be unrolled if unroll_factor
+  // is greater than one.
+  std::unique_ptr<KernelThunk> BuildKernelThunk(const HloInstruction* inst,
+                                                int unroll_factor = 1);
 
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index c20a781a33fe89af4740ed31dd5bfb1a64473057..c24dc1457f83c7557430a69baf806ed05b45adca 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -30,10 +30,12 @@ namespace gpu {
 
 KernelThunk::KernelThunk(
     tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
-    const string& kernel_name, const HloInstruction* hlo_instruction)
+    const string& kernel_name, const HloInstruction* hlo_instruction,
+    int unroll_factor)
     : Thunk(Kind::kKernel, hlo_instruction),
       args_(args.begin(), args.end()),
-      kernel_name_(kernel_name) {}
+      kernel_name_(kernel_name),
+      unroll_factor_(unroll_factor) {}
 
 tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
   tensorflow::mutex_lock lock(mutex_);
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index 9ae455e2fcc253a7a08ff95764721048a16b0bf7..df8971b083fe70588f8c32f977981e365d78fdb8 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -47,12 +47,14 @@ class KernelThunk : public Thunk {
   //
   // `hlo_instruction` is as in Thunk. Other arguments are as the class members.
   KernelThunk(tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
-              const string& kernel_name, const HloInstruction* hlo_instruction);
+              const string& kernel_name, const HloInstruction* hlo_instruction,
+              int unroll_factor);
   KernelThunk(const KernelThunk&) = delete;
   KernelThunk& operator=(const KernelThunk&) = delete;
   ~KernelThunk() override = default;
 
   const string& kernel_name() const { return kernel_name_; }
+  int unroll_factor() const { return unroll_factor_; }
   void SetLaunchDimensions(const LaunchDimensions& launch_dims);
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
@@ -69,6 +71,10 @@ class KernelThunk : public Thunk {
   // Entry kernel name for the computation.
   const string kernel_name_;
 
+  // The number of times this kernel should be unrolled. This works as a
+  // multiplier on the number of elements produced by a GPU thread.
+  const int unroll_factor_;
+
   // The thread and block dimension used to launch the kernel.
   // Will be set by IrEmitterUnnested.
   LaunchDimensions launch_dimensions_;
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index defd281d74bd38f7da3f268e0f55970fc1af8263..df9d9be889ce839ee665cd4820b169c124d9fcde 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/CodeGen/CommandFlags.def"
+#include "llvm/CodeGen/CommandFlags.inc"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 388dcc008b07a76ff9ed07df04181e49a8734f51..d8c07dc3119fb81a3ef22822acb11b7c4d5bbca5 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -32,25 +32,32 @@ namespace gpu {
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     BodyEmitter body_emitter, const Shape& shape,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(body_emitter, shape, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(target_element_generator, target_arrays, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     const llvm_ir::IrArray& target_array,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(target_element_generator, target_array, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
-llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<llvm_ir::IrArray::Index>
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   // Emit the following code in LLVM IR:
   //   linear_index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -63,6 +70,9 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   //   "It is guaranteed that [...] 0  <=  %ctaid.x <  %nctaid.x"
   //
   // %nctaid.x is currently specified as 2147483647.
+  VLOG(3) << "EmitIndexAndSetExitBasicBlock unroll_factor " << unroll_factor_;
+  std::vector<llvm_ir::IrArray::Index> array_indices;
+
   llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, ir_builder_);
   llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(),
@@ -81,7 +91,7 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   thread_id = ir_builder_->CreateZExt(thread_id, ir_builder_->getInt64Ty(),
                                       "thread_id");
 
-  llvm::Value* linear_index = ir_builder_->CreateAdd(
+  llvm::Value* linear_index_base = ir_builder_->CreateAdd(
       ir_builder_->CreateMul(
           block_id,
           ir_builder_->getInt64(launch_dimensions_.threads_per_block()), "",
@@ -99,15 +109,30 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::assume,
       {ir_builder_->CreateICmpULT(
-          linear_index,
+          linear_index_base,
           ir_builder_->getInt64(launch_dimensions_.threads_per_block() *
                                 launch_dimensions_.block_count()),
           "linear_index_in_range")},
       {}, ir_builder_);
 
+  if (unroll_factor_ > 1) {
+    linear_index_base = ir_builder_->CreateMul(
+        linear_index_base, ir_builder_->getInt64(unroll_factor_),
+        "linear_index_base", /*HasNUW=*/true, /*HasNSW=*/true);
+  }
+
+  array_indices.emplace_back(linear_index_base, shape_, ir_builder_);
+  for (int i = 1; i < unroll_factor_; ++i) {
+    llvm::Value* linear_index = ir_builder_->CreateAdd(
+        linear_index_base, ir_builder_->getInt64(i), "linear_index",
+        /*HasNUW=*/true, /*HasNSW=*/true);
+    array_indices.emplace_back(linear_index, shape_, ir_builder_);
+  }
+
   auto if_in_bounds = llvm_ir::EmitIfThenElse(
       ir_builder_->CreateICmpULT(
-          linear_index, ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
+          linear_index_base,
+          ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
       llvm_ir::IrName(loop_name, "in_bounds"), ir_builder_, false);
 
   // Set exit_bb_ to the exit block of the if structure.
@@ -116,7 +141,8 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
 
   // Set IR builder insertion point to the body of the if structure.
   llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
-  return llvm_ir::IrArray::Index(linear_index, shape_, ir_builder_);
+
+  return array_indices;
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 8ed63a854a74fc06c3c389f40fe1f5970885deac..25318b3bed8bf4a2dfe3a4a974269d0405c3bfec 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -34,13 +34,13 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   // The meanings of other parameters are the same as LoopEmitter.
   ParallelLoopEmitter(BodyEmitter body_emitter, const Shape& shape,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
   // Constructs a ParallelLoopEmitter from an element generator that generates
   // each element of the given target array.
   ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
                       const llvm_ir::IrArray& target_array,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
 
   // Constructs a loop emitter for a loop that generates on element of each of N
   // arrays on each iteration.
@@ -50,18 +50,20 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter(
       const llvm_ir::ElementGenerator& target_element_generator,
       tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder);
+      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+      int unroll_factor = 1);
 
   ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
 
-  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name) override;
 
  private:
   // The thread and block dimension to parallelize the loop on.
   const LaunchDimensions launch_dimensions_;
+  const int unroll_factor_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index 6cf280df05496716a0780d61ded92efd9982734c..5283d51cd10668c43c5ad1c1fb11049555bff5d4 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -44,12 +44,16 @@ std::ostream& operator<<(std::ostream& out,
 
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape, const se::DeviceDescription& device_desc) {
+    const Shape& shape, const se::DeviceDescription& device_desc,
+    int unroll_factor) {
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
     return LaunchDimensions();
   }
 
+  CHECK_EQ(num_elements % unroll_factor, 0);
+  num_elements = num_elements / unroll_factor;
+
   // Since we don't do any inter-warp communication, we're free to choose any
   // block size we want, subject to hardware constraints.  We choose the
   // smallest block size that allows the GPU to reach full occupancy (assuming
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 0bf463a6ef95d5a32784838c08ad239752fd1acf..42d2d2af2e334da7c42419cb07a2bd5bb9d209d6 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -58,7 +58,8 @@ std::ostream& operator<<(std::ostream& out,
 
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc);
+    const perftools::gputools::DeviceDescription& device_desc,
+    int unroll_factor = 1);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 0b446c654779db410ebbd91ef9a5bab14d08a278..8fd7f8945c7c36a451af30fcd5939a2498648e74 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -135,6 +135,10 @@ message HloInstructionProto {
   xla.GatherDimensionNumbers gather_dimension_numbers = 33;
   repeated int64 gather_window_bounds = 34;
 
+  // Compute Host.
+  string channel_name = 41;
+  int64 cost_estimate_ns = 42;
+
   // The id of this instruction.
   int64 id = 35;
 
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 35ecd4428d0dfde2de445ea34472d2c78148c6c9..7aa38c6b79ed904bb4a518c4b7aaa1e079c27ea8 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -69,7 +69,8 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
       // Broadcasts dramatically increase the size of constants, which is often
       // detrimental to performance and memory capacity, so do not fold
       // broadcasts.
-      if (instruction->opcode() == HloOpcode::kBroadcast) {
+      if (instruction->opcode() == HloOpcode::kBroadcast ||
+          instruction->opcode() == HloOpcode::kBroadcastDimOne) {
         continue;
       }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 44e4f75f75b275653e1a07111943843fc6f78b33..ea4dd62fdb5bb3be40987d1a6ea96b3a58b0053b 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -336,6 +336,11 @@ Status HloCostAnalysis::HandleBroadcast(const HloInstruction*) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleBroadcastDimOne(
+    const HloInstruction* broadcastDimOne) {
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandlePad(const HloInstruction*) {
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index d17678d20f2a23fd98d18b77d5fb25853901a789..a9f6845747aa2081df936d388551bbc0b75b787b 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -95,6 +95,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleSelectAndScatter(const HloInstruction* instruction) override;
   Status HandleBitcast(const HloInstruction* bitcast) override;
   Status HandleBroadcast(const HloInstruction* broadcast) override;
+  Status HandleBroadcastDimOne(const HloInstruction* broadcastDimOne) override;
   Status HandlePad(const HloInstruction* pad) override;
   Status HandleReshape(const HloInstruction* reshape) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 53ad8909c5e6b3a1dfb4c931a24107b5072d6e96..b4f9a9db9cbcae56fbf60ad9d2ef4b3e0ffe2a90 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -399,6 +399,22 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleBitcastConvert(HloInstruction* convert) override {
+    const HloInstruction* operand = convert->operand(0);
+    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
+                        parent_->GetEvaluatedLiteralFor(operand).BitcastConvert(
+                            convert->shape().element_type()));
+
+    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
+      parent_->evaluated_[convert] = std::move(result);
+    } else {
+      parent_->evaluated_[convert] =
+          result->Relayout(convert->shape().layout());
+    }
+    return Status::OK();
+  }
+
   Status HandleExp(HloInstruction* exp) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
                         ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) {
@@ -998,18 +1014,6 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
-    // Dimension number applicable for input (lhs).
-    const int64 input_batch_dim = dnums.input_batch_dimension();
-    const int64 input_z_dim = dnums.input_feature_dimension();
-    // Dimension number applicable for kernel (rhs).
-    const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
-    const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
-    // Dimension number applicable for output.
-    const int64 output_batch_dim = dnums.output_batch_dimension();
-    const int64 output_z_dim = dnums.output_feature_dimension();
-
-    const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
-
     std::vector<int64> window_dimension_sizes;
     for (auto i : dnums.kernel_spatial_dimensions()) {
       window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i));
@@ -1021,14 +1025,27 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape);
     DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape);
 
-    DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
-
     auto lhs_literal_data = lhs_literal.data<ReturnT>();
     auto rhs_literal_data = rhs_literal.data<ReturnT>();
 
-    auto func = [&](ArraySlice<int64> out_index) {
+    auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window,
+                 &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data,
+                 rhs_literal_data](ArraySlice<int64> out_index) {
+      // Dimension number applicable for input (lhs).
+      const int64 input_batch_dim = dnums.input_batch_dimension();
+      const int64 input_z_dim = dnums.input_feature_dimension();
+      // Dimension number applicable for kernel (rhs).
+      const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
+      const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
+      // Dimension number applicable for output.
+      const int64 output_batch_dim = dnums.output_batch_dimension();
+      const int64 output_z_dim = dnums.output_feature_dimension();
+
+      const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
+
       ElementwiseT result_val = static_cast<ElementwiseT>(0);
-      std::fill(rhs_spatial_index.begin(), rhs_spatial_index.end(), 0);
+      DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(),
+                                        0);
 
       // Convolve input feature with kernel.
       do {
@@ -1100,7 +1117,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     };
 
     auto result = Literal::CreateFromShape(result_shape);
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(func));
+    TF_RETURN_IF_ERROR(result->PopulateParallel<ReturnT>(func));
 
     parent_->evaluated_[conv] = std::move(result);
     return Status::OK();
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 25702dc65ea1ebd9d91b3382dcb909e606628202..c35783c456c63b9a651d1221cf9a3d70af38ba66 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -956,6 +956,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
+    case HloOpcode::kBroadcastDimOne:
       // De-emphasize nodes which broadcast a scalar within a fusion node --
       // these are essentially free.
       if (instr->IsFused() &&
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index fcf9ebf5f787445f5e89f126e9f2393fd3bd1790..56cb241087cf31084df76c25ead89d477cd38f0f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -159,6 +159,23 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->fft_length_.push_back(fft_len);
   }
 
+  if (proto.has_sharding()) {
+    TF_ASSIGN_OR_RETURN(const auto& sharding,
+                        HloSharding::FromProto(proto.sharding()));
+    instruction->set_sharding(sharding);
+  }
+
+  if (proto.has_gather_dimension_numbers()) {
+    instruction->gather_dimension_numbers_ =
+        MakeUnique<GatherDimensionNumbers>(proto.gather_dimension_numbers());
+  }
+  for (int64 bound : proto.gather_window_bounds()) {
+    instruction->gather_window_bounds_.push_back(bound);
+  }
+
+  instruction->channel_name_ = proto.channel_name();
+  instruction->cost_estimate_ns_ = proto.cost_estimate_ns();
+
   return std::move(instruction);
 }
 
@@ -683,6 +700,15 @@ HloInstruction::CreateSelectAndScatter(
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateBroadcastDimOne(const Shape& shape,
+                                      HloInstruction* operand) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kBroadcastDimOne, shape));
+  instruction->AppendOperand(operand);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateBroadcastSequence(
     const Shape& output_shape, HloInstruction* operand,
@@ -812,6 +838,16 @@ static string FusionNodeName(HloInstruction::FusionKind fusion_kind) {
   return instruction;
 }
 
+void HloInstruction::SetupDerivedInstruction(
+    HloInstruction* derived_instruction) const {
+  if (sharding_ != nullptr) {
+    derived_instruction->set_sharding(*sharding_);
+  } else {
+    derived_instruction->clear_sharding();
+  }
+  derived_instruction->set_metadata(metadata_);
+}
+
 HloInstruction* HloInstruction::AddFusionOperand(HloInstruction* new_operand) {
   CHECK_EQ(opcode(), HloOpcode::kFusion);
   CHECK_EQ(operand_count(),
@@ -1275,6 +1311,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateBroadcast(shape, new_operands[0], dimensions_);
       break;
+    case HloOpcode::kBroadcastDimOne:
+      CHECK_EQ(new_operands.size(), 1);
+      clone = CreateBroadcastDimOne(shape, new_operands[0]);
+      break;
     case HloOpcode::kCall:
       clone = CreateCall(shape, new_operands, to_apply());
       break;
@@ -1450,10 +1490,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kTrace:
       LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
-  clone->set_metadata(metadata_);
-  if (has_sharding()) {
-    clone->set_sharding(sharding());
-  }
+  SetupDerivedInstruction(clone.get());
   clone->set_parent(parent_);
   return clone;
 }
@@ -1826,6 +1863,8 @@ bool HloInstruction::IdenticalSlowPath(
 
     // Remaining instructions with special values.
     case HloOpcode::kBitcast:
+    case HloOpcode::kBroadcastDimOne:
+    case HloOpcode::kDynamicUpdateSlice:
       return eq_shapes(shape(), other.shape());
     case HloOpcode::kBroadcast:
       return eq_shapes(shape(), other.shape()) &&
@@ -1844,8 +1883,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kDynamicSlice:
       return eq_shapes(shape(), other.shape()) &&
              dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
-    case HloOpcode::kDynamicUpdateSlice:
-      return eq_shapes(shape(), other.shape());
     case HloOpcode::kCall:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
@@ -2403,6 +2440,15 @@ HloInstructionProto HloInstruction::ToProto() const {
     proto.add_fft_length(fft_len);
   }
 
+  if (gather_dimension_numbers_ != nullptr) {
+    *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_;
+  }
+  for (int64 bound : gather_window_bounds_) {
+    proto.add_gather_window_bounds(bound);
+  }
+  proto.set_channel_name(channel_name_);
+  proto.set_cost_estimate_ns(cost_estimate_ns_);
+
   return proto;
 }
 
@@ -2646,6 +2692,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleBitcast(this);
     case HloOpcode::kBroadcast:
       return visitor->HandleBroadcast(this);
+    case HloOpcode::kBroadcastDimOne:
+      return visitor->HandleBroadcastDimOne(this);
     case HloOpcode::kPad:
       return visitor->HandlePad(this);
     case HloOpcode::kReshape:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 80f84082442798d240a0a8e11d85ceaf638a4695..49aa07502996b698bb20f2c2e9d1d371d43d1793 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -401,6 +401,10 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
+  // Creates a broadcast-size-one-dimensions instruction.
+  static std::unique_ptr<HloInstruction> CreateBroadcastDimOne(
+      const Shape& shape, HloInstruction* operand);
+
   // Creates a sequence of instructions that performs an explicit broadcast of
   // the operand to the target shape.
   //
@@ -945,6 +949,13 @@ class HloInstruction {
   // Return true if this operator has a sharding assigned.
   bool has_sharding() const { return sharding_ != nullptr; }
 
+  // When creating a new instruction which either replaces, or shifts up (kCopy
+  // insertion case), another instruction, we need to make sure the certain
+  // properties of the new instruction are copied into the derived one. As of
+  // today, the metadata and sharding will be propagated to the derived
+  // instruction.
+  void SetupDerivedInstruction(HloInstruction* derived_instruction) const;
+
   // Adds a new operand the fusion instruction.
   HloInstruction* AddFusionOperand(HloInstruction* new_operand);
 
@@ -1442,7 +1453,7 @@ class HloInstruction {
   string channel_name_;
 
   // Estimate of the duration of a host computation in nanoseconds.
-  int64 cost_estimate_ns_;
+  int64 cost_estimate_ns_ = 0;
 
   // Computations called by this instruction.
   std::vector<HloComputation*> called_computations_;
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index af24604c39b554f146793594958f373999844b4c..dddc72480f93c4c3cc29f41db99fa773dc8d6b68 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -54,6 +54,7 @@ namespace xla {
   V(kBitcast, "bitcast")                                     \
   V(kBitcastConvert, "bitcast-convert")                      \
   V(kBroadcast, "broadcast")                                 \
+  V(kBroadcastDimOne, "broadcast-dim-one")                   \
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
   V(kClamp, "clamp")                                         \
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index ec7d8210a70ad7498f77fe807abd53544d4b0487..2e834a79d9f63154172798d252be938d0d475c01 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -16,21 +16,16 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_runner.h"
 
-#include <set>
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/executable.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -91,15 +86,6 @@ HloRunner::ReadModuleFromHloTextFile(const std::string& filename,
   return tools::Parse(hlo_string, config);
 }
 
-// Define this in .cc file to avoid having to include eigen or forward declare
-// these types in the header.
-struct HloRunner::EigenThreadPoolWrapper {
-  std::unique_ptr<EigenThreadPoolWrapper> pool;
-  std::unique_ptr<Eigen::ThreadPoolDevice> device;
-};
-
-HloRunner::HloRunner() {}
-
 HloRunner::HloRunner(se::Platform* platform) {
   BackendOptions backend_options;
   backend_options.set_platform(platform);
@@ -113,32 +99,14 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
     std::unique_ptr<HloModule> module,
     const tensorflow::gtl::ArraySlice<Literal*> arguments,
     bool run_hlo_passes) {
-  if (run_hlo_passes) {
-    TF_ASSIGN_OR_RETURN(
-        module, backend().compiler()->RunHloPasses(
-                    std::move(module), backend().default_stream_executor(),
-                    /*device_allocator=*/nullptr));
-  }
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable,
-      backend().compiler()->RunBackend(std::move(module),
-                                       backend().default_stream_executor(),
-                                       /*device_allocator=*/nullptr));
-
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      CreateExecutable(std::move(module), run_hlo_passes));
   se::Stream stream(backend().default_stream_executor());
   stream.Init();
 
-  ExecutableRunOptions run_options;
-  run_options.set_device_ordinal(backend().default_device_ordinal());
-  run_options.set_stream(&stream);
-  run_options.set_allocator(backend().memory_allocator());
-  run_options.set_inter_op_thread_pool(backend().inter_op_thread_pool());
-  run_options.set_intra_op_thread_pool(
-      backend().eigen_intra_op_thread_pool_device());
-
-  ServiceExecutableRunOptions service_run_options(
-      run_options, backend().StreamBorrower(),
-      backend().inter_op_thread_pool());
+  ServiceExecutableRunOptions service_run_options(GetServiceRunOptionsForDevice(
+      backend().default_device_ordinal(), &stream, nullptr));
+  const ExecutableRunOptions& run_options = service_run_options.run_options();
 
   // Copy arguments to device.
   std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
@@ -178,10 +146,153 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
   return result_literal;
 }
 
+StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
+    std::unique_ptr<HloModule> module,
+    const ReplicatedExecuteOptions& options) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Executable> executable,
+      CreateExecutable(std::move(module), options.run_hlo_passes));
+  TF_ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      backend().computation_placer()->AssignDevices(options.num_replicas, 1));
+  std::vector<std::unique_ptr<se::Stream>> streams;
+  std::vector<ServiceExecutableRunOptions> service_run_options;
+  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
+  // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are
+  // no arguments.
+  std::vector<const ShapedBuffer*> argument_buffer_ptrs(
+      options.num_replicas * options.arguments.size() + 1);
+  std::vector<tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
+      argument_buffer_slices;
+  int64 index = 0;
+  for (int64 i = 0; i < options.num_replicas; ++i) {
+    int64 device = device_assignment(i, 0);
+    TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
+                        backend().stream_executor(device));
+    streams.push_back(absl::make_unique<se::Stream>(executor));
+    streams.back()->Init();
+    service_run_options.emplace_back(GetServiceRunOptionsForDevice(
+        device, streams.back().get(), &device_assignment));
+
+    // Copy arguments to device.
+    for (const Literal* argument : options.arguments) {
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+          backend().transfer_manager()->AllocateScopedShapedBuffer(
+              argument->shape(), backend().memory_allocator(), device));
+      TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+          executor, *argument, *argument_buffer));
+      argument_buffers.push_back(std::move(argument_buffer));
+      argument_buffer_ptrs[index++] = argument_buffers.back().get();
+    }
+    argument_buffer_slices.emplace_back(
+        &argument_buffer_ptrs[index - options.arguments.size()],
+        options.arguments.size());
+  }
+
+  std::unique_ptr<tensorflow::thread::ThreadPool> pool;
+  int64 num_threads = (options.infeed != nullptr) ? options.num_replicas : 0;
+  if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
+    num_threads += options.num_replicas;
+  }
+  if (num_threads > 0) {
+    pool = absl::make_unique<tensorflow::thread::ThreadPool>(
+        tensorflow::Env::Default(), "infeed_outfeed",
+        /*num_threads=*/num_threads);
+  }
+  if (options.infeed != nullptr) {
+    for (int64 i = 0; i < options.num_replicas; ++i) {
+      int64 device = device_assignment(i, 0);
+      pool->Schedule([this, device, &options]() {
+        se::StreamExecutor* executor =
+            backend().stream_executor(device).ValueOrDie();
+        VLOG(1) << "Starting infeed on device " << device;
+        for (int64 step = 1;
+             options.infeed_steps < 0 || step <= options.infeed_steps; ++step) {
+          TF_CHECK_OK(backend().transfer_manager()->TransferLiteralToInfeed(
+              executor, *options.infeed));
+          if (step % 100 == 0) {
+            VLOG(1) << "Infeed step " << step;
+          }
+        }
+      });
+    }
+  }
+  if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
+    for (int64 i = 0; i < options.num_replicas; ++i) {
+      int64 device = device_assignment(i, 0);
+      pool->Schedule([this, device, &options]() {
+        se::StreamExecutor* executor =
+            backend().stream_executor(device).ValueOrDie();
+        VLOG(1) << "Starting outfeed on device " << device;
+        for (int64 step = 1;
+             options.infeed_steps < 0 || step <= options.infeed_steps; ++step) {
+          auto literal = absl::make_unique<Literal>();
+          TF_CHECK_OK(backend().transfer_manager()->TransferLiteralFromOutfeed(
+              executor, options.outfeed_shape, literal.get()));
+          if (options.outfeed_values != nullptr) {
+            options.outfeed_values->push_back(std::move(literal));
+          }
+          if (step % 100 == 0) {
+            VLOG(1) << "Outfeed step " << step;
+          }
+        }
+      });
+    }
+  }
+
+  LOG(INFO) << "Replicated execution started";
+  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<ShapedBuffer>> results,
+                      executable->ExecuteOnStreams(service_run_options,
+                                                   argument_buffer_slices));
+  LOG(INFO) << "Replicated execution terminated";
+
+  std::vector<std::unique_ptr<Literal>> exec_results;
+  for (int64 i = 0; i < options.num_replicas; ++i) {
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<ScopedShapedBuffer> result,
+                        ScopedShapedBuffer::MakeScoped(
+                            results[i].get(), backend().memory_allocator()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
+                        backend().transfer_manager()->TransferLiteralFromDevice(
+                            streams[i]->parent(), *result));
+    exec_results.push_back(std::move(literal));
+  }
+  return std::move(exec_results);
+}
+
+StatusOr<std::unique_ptr<Executable>> HloRunner::CreateExecutable(
+    std::unique_ptr<HloModule> module, bool run_hlo_passes) {
+  if (run_hlo_passes) {
+    TF_ASSIGN_OR_RETURN(
+        module, backend().compiler()->RunHloPasses(
+                    std::move(module), backend().default_stream_executor(),
+                    backend().memory_allocator()));
+  }
+  return backend().compiler()->RunBackend(std::move(module),
+                                          backend().default_stream_executor(),
+                                          backend().memory_allocator());
+}
+
+ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice(
+    int64 device, se::Stream* stream, DeviceAssignment* device_assignment) {
+  ExecutableRunOptions run_options;
+  run_options.set_device_ordinal(device);
+  run_options.set_stream(stream);
+  run_options.set_allocator(backend().memory_allocator());
+  run_options.set_inter_op_thread_pool(backend().inter_op_thread_pool());
+  run_options.set_intra_op_thread_pool(
+      backend().eigen_intra_op_thread_pool_device());
+  if (device_assignment != nullptr) {
+    run_options.set_device_assignment(device_assignment);
+  }
+  return ServiceExecutableRunOptions(run_options, backend().StreamBorrower(),
+                                     backend().inter_op_thread_pool());
+}
+
 Backend& HloRunner::backend() {
   if (!backend_) {
     backend_ = Backend::CreateDefaultBackend().ConsumeValueOrDie();
-    VLOG(1) << "executing on platform " << backend().platform()->Name();
+    VLOG(1) << "Executing on platform " << backend().platform()->Name();
   }
   return *backend_;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index 06ce22a5b9fc7b3d6c10857c84196094c0eed303..f54fb44766eb07f402b2946abc83d50d155e47c1 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -16,12 +16,16 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_RUNNER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_RUNNER_H_
 
+#include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -40,9 +44,43 @@ namespace xla {
 // file), or parsed from a hlo textual IR string.
 class HloRunner {
  public:
-  HloRunner();
-
-  HloRunner(::perftools::gputools::Platform* platform);
+  // The options used to configure a ExecuteReplicated() call.
+  struct ReplicatedExecuteOptions {
+    // The number of devices the HLO module should be replicated onto.
+    int64 num_replicas = 1;
+
+    // The arguments to be fed to each replica. Since this is used for a
+    // replicated execution, all the arguments are the same for all replicas.
+    std::vector<const Literal*> arguments;
+
+    // If the HLO module being run has an infeed instruction, this will be the
+    // data which will be fed to it, for as many as infeed_steps steps.
+    const Literal* infeed = nullptr;
+
+    // The number of times the infeed literal should be fed to the HLO module.
+    // For a clean exit, this should match the iterations-per-loop parameter
+    // used when generating the HLO module proto (that is usually the main
+    // while bounary counter). A value higher then iterations-per-loop would
+    // lead to infeed threads feeding to a gone computation, while a lower
+    // value would trigger a stuck ExecuteReplicated() call (the computation
+    // will be trying to infeed data which will never come).
+    int64 infeed_steps = -1;
+
+    // The shape of the outfeed operation. If empty, the HLO module does not
+    // generate any outfeed.
+    Shape outfeed_shape;
+
+    // A pointer to a vector where the outfeed values will be stored. If
+    // nullptr, the values will be read and discarded.
+    std::vector<std::unique_ptr<Literal>>* outfeed_values = nullptr;
+
+    // Whether the HLO passes should be run on the input module. Usually
+    // saved modules are coming from after the HLO pass pipeline, so triggering
+    // another run will likely cause errors.
+    bool run_hlo_passes = false;
+  };
+
+  explicit HloRunner(::perftools::gputools::Platform* platform);
 
   ~HloRunner();
 
@@ -86,6 +124,13 @@ class HloRunner {
     return Execute(std::move(module), argument_pointers, run_hlo_passes);
   }
 
+  // Executes a given HLO module into a set of replicas, and returns a map
+  // with the replica number as key, and the corresponding returned literal as
+  // value.
+  StatusOr<std::vector<std::unique_ptr<Literal>>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module,
+      const ReplicatedExecuteOptions& options);
+
   // If backend is not created in the constructor, creates and returns the
   // default backend. If creation fails, crashes the program.
   //
@@ -94,9 +139,18 @@ class HloRunner {
   Backend& backend();
 
  private:
-  struct EigenThreadPoolWrapper;
-
-  std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
+  // Creates an executable object given an HLO module. If run_hlo_passes is
+  // true, the HLO passes will be run before.
+  StatusOr<std::unique_ptr<Executable>> CreateExecutable(
+      std::unique_ptr<HloModule> module, bool run_hlo_passes);
+
+  // Creates a ServiceExecutableRunOptions object to configure a run on device,
+  // using the provided stream object. If device_assignment is not nullptr, it
+  // will be used to configure the replication parameters. Replicated executions
+  // should pass the device_assignment parameter.
+  ServiceExecutableRunOptions GetServiceRunOptionsForDevice(
+      int64 device, ::perftools::gputools::Stream* stream,
+      DeviceAssignment* device_assignment);
 
   std::unique_ptr<Backend> backend_;
 };
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index e8e45f1ee968992901988e8b85d4e9ae28f2abe9..1b42349b0b3ad9634bb910b3843affed6a0ca334 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -376,6 +376,16 @@ HloSharding HloSharding::TransformShardedTileShape(
   return HloSharding::Tile(new_tile_shape, tile_assignment());
 }
 
+HloSharding HloSharding::GetSubSharding(const Shape& shape,
+                                        const ShapeIndex& index) const {
+  CHECK(IsTuple());
+
+  ShapeTree<HloSharding> sub_shape_tree(ShapeUtil::GetSubshape(shape, index),
+                                        Replicate());
+  sub_shape_tree.CopySubtreeFrom(GetAsShapeTree(shape), index, {});
+  return Tuple(sub_shape_tree);
+}
+
 std::ostream& operator<<(std::ostream& out, const HloSharding& sharding) {
   out << sharding.ToString();
   return out;
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 06204acbca30648e73382cb4641139e852664b77..2b8e757f42991f697df37d3d34bfdff6a36bc509 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -175,6 +175,10 @@ class HloSharding {
     }
   }
 
+  // Retrieves the sub sharding at a given index, out of a tuple sharding.
+  // REQUIRES: IsTuple()
+  HloSharding GetSubSharding(const Shape& shape, const ShapeIndex& index) const;
+
   bool operator==(const HloSharding& other) const {
     return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
            ShapeUtil::Compatible(tile_shape_, other.tile_shape_) &&
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 8c875698eb1992719d504d272ca338b05b60e36b..63ec5964eb935239e86233c1ae94e2bcce3b0461 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -174,17 +174,34 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
   TF_RETURN_IF_ERROR(CheckShape(broadcast, broadcast->shape()));
   TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
                broadcast->dimensions().size());
-  for (int64 operand_dimension = 0;
-       operand_dimension < ShapeUtil::Rank(operand_shape);
-       ++operand_dimension) {
-    int64 output_dimension = broadcast->dimensions()[operand_dimension];
+  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+    int64 output_dimension = broadcast->dimensions()[i];
     TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
-                 operand_shape.dimensions(operand_dimension))
+                 operand_shape.dimensions(i))
         << broadcast->ToString() << " operand shape " << operand_shape;
   }
   return tensorflow::Status::OK();
 }
 
+Status ShapeVerifier::HandleBroadcastDimOne(HloInstruction* broadcastDimOne) {
+  const Shape& operand_shape = broadcastDimOne->operand(0)->shape();
+  int64 operand_rank = ShapeUtil::Rank(operand_shape);
+  const Shape& output_shape = broadcastDimOne->shape();
+  // Check for mixed precision.
+  TF_RETURN_IF_ERROR(CheckShape(broadcastDimOne, output_shape));
+  TF_RET_CHECK(operand_rank == ShapeUtil::Rank(output_shape));
+  for (int64 i = 0; i < operand_rank; ++i) {
+    int64 operand_dimension = operand_shape.dimensions(i);
+    int64 output_dimension = output_shape.dimensions(i);
+    TF_RET_CHECK(operand_dimension == 1 ||
+                 operand_dimension == output_dimension)
+        << "Dimension " << i << " of broadcastDimOne "
+        << broadcastDimOne->ToString() << " is " << operand_dimension
+        << ", expected 1 or " << output_dimension;
+  }
+  return tensorflow::Status::OK();
+}
+
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
   // Check for mixed precision.
   TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape()));
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1dd7ec3c51e18dcfe89bd478de87798ba3858119..a4dff977ba268137d8ab94c576b4b511e911806f 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -54,6 +54,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleReduce(HloInstruction* reduce) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleBroadcast(HloInstruction* broadcast) override;
+  Status HandleBroadcastDimOne(HloInstruction* broadcastDimOne) override;
   Status HandleReshape(HloInstruction* reshape) override;
   Status HandleTranspose(HloInstruction* transpose) override;
   Status HandleParameter(HloInstruction*) override;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index d69ad80bdb4d2eab2d34228be026d7bc0b76efc0..3f4dbf897df7e1fd62f4229ed90c949c59da9d46 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -37,6 +37,7 @@ namespace xla {
     case HloOpcode::kBitcast:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kBroadcast:
+    case HloOpcode::kBroadcastDimOne:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kComplex:
@@ -142,7 +143,8 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) {
       });
   return std::count_if(hlo->operands().begin(), hlo->operands().end(),
                        [output_rank](HloInstruction* operand) {
-                         if (operand->opcode() == HloOpcode::kBroadcast) {
+                         if (operand->opcode() == HloOpcode::kBroadcast ||
+                             operand->opcode() == HloOpcode::kBroadcastDimOne) {
                            return false;
                          }
                          if (operand->opcode() == HloOpcode::kConstant &&
@@ -247,7 +249,8 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
     auto reachability = computation->ComputeReachability();
 
     auto cheap_to_duplicate = [this](HloInstruction* producer) {
-      if (producer->opcode() == HloOpcode::kBroadcast) {
+      if (producer->opcode() == HloOpcode::kBroadcast ||
+          producer->opcode() == HloOpcode::kBroadcastDimOne) {
         return true;
       }
       if (producer->opcode() == HloOpcode::kConstant &&
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 9171e859c6f84ceef9664aa1eb90a07c87dfab40..5b9bf5faf366d674ecadd59fa8a0af8d4976a962 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -96,7 +96,7 @@ InterpreterCompiler::CompileAheadOfTime(
 }
 
 se::Platform::Id InterpreterCompiler::PlatformId() const {
-  return sep::kInterpreterPlatformId;
+  return sep::kXlaInterpreterPlatformId;
 }
 
 HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
@@ -109,11 +109,11 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
 }
 
 static bool InitModule() {
-  xla::Compiler::RegisterCompilerFactory(sep::kInterpreterPlatformId, []() {
+  xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() {
     return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
   });
-  xla::ComputationPlacer::RegisterComputationPlacer(sep::kInterpreterPlatformId,
-                                                    &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(
+      sep::kXlaInterpreterPlatformId, &CreateComputationPlacer);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 68371910d76f42c0b6d4b1adad9d6a83bdb858e6..3caf9e7b82b21a84197ffe60267d6d953f9547a1 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -28,84 +28,85 @@ host::HostStream *AsExecutorStream(Stream *stream) {
   return dynamic_cast<host::HostStream *>(stream->implementation());
 }
 
-InterpreterExecutor::InterpreterExecutor(const PluginConfig &plugin_config)
+XlaInterpreterExecutor::XlaInterpreterExecutor(
+    const PluginConfig &plugin_config)
     : plugin_config_(plugin_config) {}
 
-InterpreterExecutor::~InterpreterExecutor() {}
+XlaInterpreterExecutor::~XlaInterpreterExecutor() {}
 
-void *InterpreterExecutor::Allocate(uint64 size) { return new char[size]; }
+void *XlaInterpreterExecutor::Allocate(uint64 size) { return new char[size]; }
 
-void *InterpreterExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
-                                             uint64 offset_bytes,
-                                             uint64 /*size_bytes*/) {
+void *XlaInterpreterExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
+                                                uint64 offset_bytes,
+                                                uint64 /*size_bytes*/) {
   return parent + offset_bytes;
 }
 
-void InterpreterExecutor::Deallocate(DeviceMemoryBase *mem) {
+void XlaInterpreterExecutor::Deallocate(DeviceMemoryBase *mem) {
   if (!mem->is_sub_buffer()) {
     delete[] static_cast<char *>(mem->opaque());
   }
 }
 
-bool InterpreterExecutor::Memcpy(Stream *stream, void *host_dst,
-                                 const DeviceMemoryBase &dev_src, uint64 size) {
+bool XlaInterpreterExecutor::Memcpy(Stream *stream, void *host_dst,
+                                    const DeviceMemoryBase &dev_src,
+                                    uint64 size) {
   AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() {
     port::Status ok = SynchronousMemcpy(host_dst, dev_src, size);
   });
   return true;
 }
 
-bool InterpreterExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
-                                 const void *host_src, uint64 size) {
+bool XlaInterpreterExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
+                                    const void *host_src, uint64 size) {
   AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() {
     port::Status ok = SynchronousMemcpy(dev_dst, host_src, size);
   });
   return true;
 }
 
-port::Status InterpreterExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst,
-                                                    const void *host_src,
-                                                    uint64 size) {
+port::Status XlaInterpreterExecutor::SynchronousMemcpy(
+    DeviceMemoryBase *dev_dst, const void *host_src, uint64 size) {
   memcpy(dev_dst->opaque(), host_src, size);
   return port::Status::OK();
 }
 
-port::Status InterpreterExecutor::SynchronousMemcpy(
+port::Status XlaInterpreterExecutor::SynchronousMemcpy(
     void *host_dst, const DeviceMemoryBase &dev_src, uint64 size) {
   memcpy(host_dst, dev_src.opaque(), size);
   return port::Status::OK();
 }
 
-bool InterpreterExecutor::HostCallback(Stream *stream,
-                                       std::function<void()> callback) {
+bool XlaInterpreterExecutor::HostCallback(Stream *stream,
+                                          std::function<void()> callback) {
   AsExecutorStream(stream)->EnqueueTask(callback);
   return true;
 }
 
-bool InterpreterExecutor::CreateStreamDependency(Stream *dependent,
-                                                 Stream *other) {
+bool XlaInterpreterExecutor::CreateStreamDependency(Stream *dependent,
+                                                    Stream *other) {
   AsExecutorStream(dependent)->EnqueueTask(
       [other]() { SE_CHECK_OK(other->BlockHostUntilDone()); });
   AsExecutorStream(dependent)->BlockUntilDone();
   return true;
 }
 
-bool InterpreterExecutor::StartTimer(Stream *stream, Timer *timer) {
+bool XlaInterpreterExecutor::StartTimer(Stream *stream, Timer *timer) {
   dynamic_cast<host::HostTimer *>(timer->implementation())->Start(stream);
   return true;
 }
 
-bool InterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
+bool XlaInterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
   dynamic_cast<host::HostTimer *>(timer->implementation())->Stop(stream);
   return true;
 }
 
-port::Status InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status XlaInterpreterExecutor::BlockHostUntilDone(Stream *stream) {
   AsExecutorStream(stream)->BlockUntilDone();
   return port::Status::OK();
 }
 
-DeviceDescription *InterpreterExecutor::PopulateDeviceDescription() const {
+DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
   internal::DeviceDescriptionBuilder builder;
 
   builder.set_device_address_bits(64);
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index c5d07e906dafb033905c50c604069e80e1ce80cd..77426b0820d2d4e6a3a3216025837de7fa5e5c65 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Declares the InterpreterExecutor class, which is a CPU-only implementation of
-// the StreamExecutor interface. For now, this is used for testing and to
+// Declares the XlaInterpreterExecutor class, which is a CPU-only implementation
+// of the StreamExecutor interface. For now, this is used for testing and to
 // examine the performance of host-based StreamExecutor code.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
@@ -50,10 +50,10 @@ namespace interpreter {
 
 using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
 
-class InterpreterExecutor : public internal::StreamExecutorInterface {
+class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
  public:
-  explicit InterpreterExecutor(const PluginConfig &plugin_config);
-  ~InterpreterExecutor() override;
+  explicit XlaInterpreterExecutor(const PluginConfig &plugin_config);
+  ~XlaInterpreterExecutor() override;
 
   port::Status Init(int device_ordinal, DeviceOptions device_options) override {
     return port::Status::OK();
diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
index cf98ecd7749d61261bf072cdb1882c7603f39556..3cf8506d1c469d7745d26834a51b4ce0eebaa942 100644
--- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
@@ -26,7 +26,7 @@ namespace sei = ::perftools::gputools::interpreter;
 namespace xla {
 
 InterpreterTransferManager::InterpreterTransferManager()
-    : GenericTransferManager(sei::kInterpreterPlatformId,
+    : GenericTransferManager(sei::kXlaInterpreterPlatformId,
                              /*pointer_size=*/sizeof(void*)) {}
 
 }  // namespace xla
@@ -38,7 +38,7 @@ CreateInterpreterTransferManager() {
 
 static bool InitModule() {
   xla::TransferManager::RegisterTransferManager(
-      sei::kInterpreterPlatformId, &CreateInterpreterTransferManager);
+      sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager);
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index a60e7fc59f7c5f0b1b24e026b34e195ca0fe5ebb..015e00e1e8edc5c77066b6038f98621862af5440 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -35,17 +35,19 @@ namespace perftools {
 namespace gputools {
 namespace interpreter {
 
-InterpreterPlatform::InterpreterPlatform() : name_("Interpreter") {}
+XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}
 
-InterpreterPlatform::~InterpreterPlatform() {}
+XlaInterpreterPlatform::~XlaInterpreterPlatform() {}
 
-Platform::Id InterpreterPlatform::id() const { return kInterpreterPlatformId; }
+Platform::Id XlaInterpreterPlatform::id() const {
+  return kXlaInterpreterPlatformId;
+}
 
-int InterpreterPlatform::VisibleDeviceCount() const { return 1; }
+int XlaInterpreterPlatform::VisibleDeviceCount() const { return 1; }
 
-const string& InterpreterPlatform::Name() const { return name_; }
+const string& XlaInterpreterPlatform::Name() const { return name_; }
 
-port::StatusOr<StreamExecutor*> InterpreterPlatform::ExecutorForDevice(
+port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::ExecutorForDevice(
     int ordinal) {
   StreamExecutorConfig config;
   config.ordinal = ordinal;
@@ -55,7 +57,7 @@ port::StatusOr<StreamExecutor*> InterpreterPlatform::ExecutorForDevice(
 }
 
 port::StatusOr<StreamExecutor*>
-InterpreterPlatform::ExecutorForDeviceWithPluginConfig(
+XlaInterpreterPlatform::ExecutorForDeviceWithPluginConfig(
     int device_ordinal, const PluginConfig& plugin_config) {
   StreamExecutorConfig config;
   config.ordinal = device_ordinal;
@@ -64,16 +66,17 @@ InterpreterPlatform::ExecutorForDeviceWithPluginConfig(
   return GetExecutor(config);
 }
 
-port::StatusOr<StreamExecutor*> InterpreterPlatform::GetExecutor(
+port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::GetExecutor(
     const StreamExecutorConfig& config) {
   return executor_cache_.GetOrCreate(
       config, [&]() { return GetUncachedExecutor(config); });
 }
 
 port::StatusOr<std::unique_ptr<StreamExecutor>>
-InterpreterPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
+XlaInterpreterPlatform::GetUncachedExecutor(
+    const StreamExecutorConfig& config) {
   auto executor = port::MakeUnique<StreamExecutor>(
-      this, port::MakeUnique<InterpreterExecutor>(config.plugin_config));
+      this, port::MakeUnique<XlaInterpreterExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
@@ -86,17 +89,17 @@ InterpreterPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
   return std::move(executor);
 }
 
-void InterpreterPlatform::RegisterTraceListener(
+void XlaInterpreterPlatform::RegisterTraceListener(
     std::unique_ptr<TraceListener> listener) {
   LOG(FATAL) << "not yet implemented: register executor trace listener";
 }
 
-void InterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
+void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
   LOG(FATAL) << "not yet implemented: unregister executor trace listener";
 }
 
-static void InitializeInterpreterPlatform() {
-  std::unique_ptr<se::Platform> platform(new sep::InterpreterPlatform);
+static void InitializeXlaInterpreterPlatform() {
+  std::unique_ptr<se::Platform> platform(new sep::XlaInterpreterPlatform);
   SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
@@ -105,7 +108,7 @@ static void InitializeInterpreterPlatform() {
 }  // namespace perftools
 
 REGISTER_MODULE_INITIALIZER(interpreter_platform,
-                            sep::InitializeInterpreterPlatform());
+                            sep::InitializeXlaInterpreterPlatform());
 
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
 
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h
index c66ddb907d1c5a8e99d3178a202a77a72a646ce5..2f71b29be4401a8374cdd0bad5830a632305fc26 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform.h
@@ -27,10 +27,10 @@ namespace perftools {
 namespace gputools {
 namespace interpreter {
 
-class InterpreterPlatform : public Platform {
+class XlaInterpreterPlatform : public Platform {
  public:
-  InterpreterPlatform();
-  ~InterpreterPlatform() override;
+  XlaInterpreterPlatform();
+  ~XlaInterpreterPlatform() override;
 
   Platform::Id id() const override;
 
@@ -60,7 +60,7 @@ class InterpreterPlatform : public Platform {
   // Cache of created StreamExecutors.
   ExecutorCache executor_cache_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(InterpreterPlatform);
+  SE_DISALLOW_COPY_AND_ASSIGN(XlaInterpreterPlatform);
 };
 
 }  // namespace interpreter
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.cc b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
index 1a0373cf86e26b564e0e732e8de1a0a5d868bfa6..b7fb365b70db7235764435305085e36869cbb13a 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.cc
@@ -18,7 +18,7 @@ namespace perftools {
 namespace gputools {
 namespace interpreter {
 
-PLATFORM_DEFINE_ID(kInterpreterPlatformId);
+PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId);
 
 }  // namespace interpreter
 }  // namespace gputools
diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.h b/tensorflow/compiler/xla/service/interpreter/platform_id.h
index 905efef1690d3bd32461353fe32dd394eb6bca9e..292f958449b52ff2f522bd31f115079b4f7e0835 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform_id.h
+++ b/tensorflow/compiler/xla/service/interpreter/platform_id.h
@@ -22,7 +22,7 @@ namespace perftools {
 namespace gputools {
 namespace interpreter {
 
-extern const Platform::Id kInterpreterPlatformId;
+extern const Platform::Id kXlaInterpreterPlatformId;
 
 }  // namespace interpreter
 }  // namespace gputools
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 39f9120e552f014dd2759bff2892157402d9c47a..2494569db53f260b900b3d5d3d0d2da5b1fc5f73 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -57,76 +57,6 @@ namespace xla {
 // anonymous namespace, instead of three or four spread all over this file.
 namespace {
 
-// Creates and returns a copy of the given instruction with a different
-// layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
-// instruction producing the copy is returned.
-StatusOr<HloInstruction*> CreateCopyWithNewLayout(
-    const Shape& shape_with_layout, HloInstruction* instruction) {
-  TF_RET_CHECK(LayoutUtil::HasLayout(shape_with_layout));
-  DCHECK(ShapeUtil::Compatible(shape_with_layout, instruction->shape()))
-      << ShapeUtil::HumanString(shape_with_layout) << " "
-      << ShapeUtil::HumanString(instruction->shape())
-      << " instruction: " << instruction->ToString();
-
-  if (ShapeUtil::IsTuple(instruction->shape())) {
-    // Deep-copy tuples.
-    std::vector<HloInstruction*> element_copies;
-    for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
-         ++i) {
-      HloInstruction* gte = instruction->parent()->AddInstruction(
-          HloInstruction::CreateGetTupleElement(
-              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
-              i));
-
-      // Recurse to copy each elements.
-      TF_ASSIGN_OR_RETURN(
-          HloInstruction * element_copy,
-          CreateCopyWithNewLayout(
-              ShapeUtil::GetSubshape(shape_with_layout, {i}), gte));
-      element_copies.push_back(element_copy);
-    }
-    // Gather element copies into a tuple with a new Tuple instruction.
-    HloInstruction* tuple_copy = instruction->parent()->AddInstruction(
-        HloInstruction::CreateTuple(element_copies));
-    LayoutUtil::ClearLayout(tuple_copy->mutable_shape());
-    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
-        shape_with_layout, tuple_copy->mutable_shape()));
-    return tuple_copy;
-  } else if (ShapeUtil::IsArray(instruction->shape())) {
-    HloInstruction* copy =
-        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
-            instruction->shape(), HloOpcode::kCopy, instruction));
-    LayoutUtil::ClearLayout(copy->mutable_shape());
-    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
-        shape_with_layout, copy->mutable_shape()));
-
-    return copy;
-  } else {
-    return FailedPrecondition(
-        "Can only copy array and tuple shaped instructions");
-  }
-}
-
-// Creates a copy of the given operand if the operand's layout does not match
-// the given layout. This copy replaces the use in the given instruction. Tuple
-// operands will be deep-copied.
-Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
-                                  HloInstruction* instruction,
-                                  int64 operand_no) {
-  HloInstruction* operand = instruction->mutable_operand(operand_no);
-  TF_RET_CHECK(operand_layout.LayoutIsSet());
-  TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
-
-  if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
-    // Operand layout already matches our constraint. Nothing to do.
-    return Status::OK();
-  }
-
-  TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
-                      CreateCopyWithNewLayout(operand_layout.shape(), operand));
-
-  return instruction->ReplaceOperandWith(operand_no, operand_copy);
-}
 
 }  // namespace
 
@@ -793,6 +723,99 @@ Status CheckConstantLayout(HloInstruction* constant) {
 
 }  // namespace
 
+StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
+    const Shape& shape_with_layout, HloInstruction* instruction) {
+  TF_RET_CHECK(LayoutUtil::HasLayout(shape_with_layout));
+  DCHECK(ShapeUtil::Compatible(shape_with_layout, instruction->shape()))
+      << ShapeUtil::HumanString(shape_with_layout) << " "
+      << ShapeUtil::HumanString(instruction->shape())
+      << " instruction: " << instruction->ToString();
+
+  if (ShapeUtil::IsTuple(instruction->shape())) {
+    // Deep-copy tuples.
+    std::vector<HloInstruction*> element_copies;
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
+         ++i) {
+      HloInstruction* gte = instruction->parent()->AddInstruction(
+          HloInstruction::CreateGetTupleElement(
+              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
+              i));
+      SetupCopiedInstruction(*instruction, gte, {i});
+      // Recurse to copy each elements.
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * element_copy,
+          CreateCopyWithNewLayout(
+              ShapeUtil::GetSubshape(shape_with_layout, {i}), gte));
+      element_copies.push_back(element_copy);
+    }
+    // Gather element copies into a tuple with a new Tuple instruction.
+    HloInstruction* tuple_copy = instruction->parent()->AddInstruction(
+        HloInstruction::CreateTuple(element_copies));
+    SetupCopiedInstruction(*instruction, tuple_copy, {});
+    LayoutUtil::ClearLayout(tuple_copy->mutable_shape());
+    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+        shape_with_layout, tuple_copy->mutable_shape()));
+    return tuple_copy;
+  } else if (ShapeUtil::IsArray(instruction->shape())) {
+    HloInstruction* copy =
+        instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
+            instruction->shape(), HloOpcode::kCopy, instruction));
+    SetupCopiedInstruction(*instruction, copy, {});
+    LayoutUtil::ClearLayout(copy->mutable_shape());
+    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+        shape_with_layout, copy->mutable_shape()));
+
+    return copy;
+  } else {
+    return FailedPrecondition(
+        "Can only copy array and tuple shaped instructions");
+  }
+}
+
+// Creates a copy of the given operand if the operand's layout does not match
+// the given layout. This copy replaces the use in the given instruction. Tuple
+// operands will be deep-copied.
+Status LayoutAssignment::CopyOperandIfLayoutsDiffer(
+    const ShapeLayout& operand_layout, HloInstruction* instruction,
+    int64 operand_no) {
+  HloInstruction* operand = instruction->mutable_operand(operand_no);
+  TF_RET_CHECK(operand_layout.LayoutIsSet());
+  TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape()));
+
+  if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) {
+    // Operand layout already matches our constraint. Nothing to do.
+    return Status::OK();
+  }
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy,
+                      CreateCopyWithNewLayout(operand_layout.shape(), operand));
+
+  return instruction->ReplaceOperandWith(operand_no, operand_copy);
+}
+
+void LayoutAssignment::SetupCopiedInstruction(const HloInstruction& instruction,
+                                              HloInstruction* copy,
+                                              const ShapeIndex& index) {
+  if (instruction.has_sharding()) {
+    // If the index is empty, we want to copy the whole sharding, in case the
+    // sharding is a tuple sharding.
+    HloSharding sharding =
+        !index.empty() && instruction.sharding().IsTuple()
+            ? instruction.sharding().GetSubSharding(instruction.shape(), index)
+            : instruction.sharding();
+    // We propagate the sharding to the copied instruction only if it is a
+    // special sharding, like tiled ones, or special devices like the
+    // HostCompute module.
+    // Otherwise it is preferable to leave the new instruction without device,
+    // and let the automatic device placer to choose the best location.
+    if (!sharding.HasUniqueDevice() ||
+        HloSharding::IsReservedDevice(sharding.UniqueDevice().ValueOrDie())) {
+      copy->set_sharding(sharding);
+    }
+  }
+  copy->set_metadata(instruction.metadata());
+}
+
 Status LayoutAssignment::CheckLayouts(HloModule* module) {
   TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                       TuplePointsToAnalysis::Run(module));
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 680f88048a1f0cd5ede7991640003ef407d4facf..ae4986d6ad9bc3de100eab9cc38b709bb56c7813 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -405,6 +405,29 @@ class LayoutAssignment : public HloPassInterface {
   ComputationLayout* entry_computation_layout_;
 
  protected:
+  // Sets up the copy instruction according to the characteristic (sharding,
+  // metadata, ...) of the reference instruction. The index argument is used
+  // when the instruction is a tuple, and in such case the index represents
+  // the location from where the copy instruction was created from.
+  // If the index is empty, the whole sharding will be propagated, even in case
+  // the intruction has a tuple sharding.
+  static void SetupCopiedInstruction(const HloInstruction& instruction,
+                                     HloInstruction* copy,
+                                     const ShapeIndex& index);
+
+  // Creates and returns a copy of the given instruction with a different
+  // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple
+  // instruction producing the copy is returned.
+  static StatusOr<HloInstruction*> CreateCopyWithNewLayout(
+      const Shape& shape_with_layout, HloInstruction* instruction);
+
+  // Creates a copy of the given operand if the operand's layout does not match
+  // the given layout. This copy replaces the use in the given instruction.
+  // Tuple operands will be deep-copied.
+  static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
+                                           HloInstruction* instruction,
+                                           int64 operand_no);
+
   // Map containing the layouts of all computations assigned so
   // far. Computations are handled in a topological sort where computations are
   // handled before their caller instructions so the layouts of caller
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index b6b918ec78a27b90325f72eea14b97f9aee43c54..3978acc132f34b8b195d3772ccf71d0d467984db 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -88,12 +88,12 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
   }
 }
 
-IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   if (ShapeUtil::IsScalar(shape_)) {
     // No loop needed, so set exit_bb_ to nullptr.
     exit_bb_ = nullptr;
-    return IrArray::Index();
+    return {IrArray::Index()};
   }
 
   // Create loop nest with one for-loop for each dimension of the target shape.
@@ -121,12 +121,14 @@ IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK_NOTNULL(exit_bb_);
 
-  return array_index;
+  return {array_index};
 }
 
 tensorflow::Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) {
-  IrArray::Index array_index = EmitIndexAndSetExitBasicBlock(loop_name);
-  TF_RETURN_IF_ERROR(body_emitter_(array_index));
+  for (const IrArray::Index& array_index :
+       EmitIndexAndSetExitBasicBlock(loop_name)) {
+    TF_RETURN_IF_ERROR(body_emitter_(array_index));
+  }
 
   // Set the insertion point of ir_builder_ to the loop exit, so that
   // code emitted for later instructions will be correctly placed.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 0fc528439a0d5bf8382dfcf2d8b3051f8900bf1d..9ff497aecd0bc964c929205c7fd410cca87d9b77 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -63,11 +63,12 @@ class LoopEmitter {
 
   // Emits a loop nest (with a yet-to-be-filled loop body) that iterates through
   // every element in the given shape. Returns the multi-dimensional index that
-  // specifies the element.
-  IrArray::Index EmitIndexAndSetExitBasicBlock() {
+  // specifies the element, will return multiple indices if the loop is
+  // unrolled.
+  std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock() {
     return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"");
   }
-  virtual IrArray::Index EmitIndexAndSetExitBasicBlock(
+  virtual std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name);
 
   // Emits a complete loop nest for every element in the given shape.
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d4963807721eb177400131fa16a69f32fb431ab
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -0,0 +1,1014 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace xla {
+
+// A pattern matcher for HloInstructions, Shapes, and Layouts.
+//
+// The Match function's first argument must be HloInstruction*, Shape*, or
+// Layout*. The second argument is a pattern that will be matched against the
+// first argument, as described below.
+//
+// Patterns are constructed using the match::Op, match::Shape, or match::Layout
+// functions. By default, the returned patterns will match any HloInstruction,
+// Shape, or Layout, respectively. However the match can be made more specific
+// by using the pattern's modifier methods, for example:
+//
+//   match::Op().WithOpcode(HloOpcode::kAdd).WithOperand(
+//     0, match::Op().WithOpcode(HloOpcode::kConstant))
+//
+// This pattern will match Add instructions whose first operand is a constant.
+//
+// Each pattern type has the following modifiers:
+//
+//   Op():
+//     - WithName: match operations with the given name
+//     - WithOpcode: match operations with the given opcode
+//     - WithShape: match operations whose shape matches the given pattern
+//     - WithOperand: match operations whose operand matches the given pattern
+//
+//   Shape():
+//     - EqualTo: matches shapes that are equal to the argument
+//     - CompatibleTo: matches shapes that are compatible to the argument
+//     - IsScalar/IsArray/IsTuple: matches scalar/array/tuple shapes
+//     - IsDenseArray/IsSparseArray: matches arrays with dense/sparse format
+//     - WithLayout: match shapes whose layout matches the given pattern
+//     - WithLayoutEqualTo: matches shapes whose layouts equal the argument
+//     - WithSubshape: matches tuple shapes whose subshape matches the given
+//       pattern
+//     - WithSubshapeEqualTo: matches shapes with a subshape equal the argument
+//     - WithElementType: matches array/scalar shapes with the given element
+//       type
+//     - WithRank: matches array/scalar types with the given rank
+//
+//  Layout():
+//     - EqualTo: matches layouts that are equal to the argument
+//     - WithDenseFormat/WithSparseFormat: matches layouts with dense/sparse
+//       format
+//
+// Op(), Shape(), and Layout() may be passed an argument of type
+// HloInstruction**, Shape**, or Layout**, respectively, or const versions of
+// these pointers. If the pattern is matched, the address of the matched value
+// will be "captured" and stored at this location.
+//
+// For example:
+//   HloInstruction* foo = ...;
+//   HloInstruction* matched_operand;
+//   CHECK(Match(foo,
+//               match::Op().WithOperand(0, match::Op(&matched_operand))));
+//
+// Helpers are provided for common nullary, unary, binary, and ternary
+// instructions. These helpers can be called with no arguments, in which case
+// they will match any instruction matching the opcode. They may also be called
+// with matches for the operands and with an optional capture. (The capture must
+// be the first argument.) Some examples of these helpers and their equivalents
+// are provided below.
+//
+// Example nullary instruction:
+//   Recv()                            == Op().WithOpcode(HloOpcode::kRecv)
+//   Recv(&a)                          == Op(&a).WithOpcode(HloOpcode::kRecv)
+//
+// Example unary instruction:
+//   Abs()                             == Op().WithOpcode(HloOpcode::kAbs)
+//   Abs(Op(&a))                       == Op().WithOpcode(HloOpcode::kAbs)
+//                                            .WithOperand(0, Op(&a)))
+//   Abs(&a, Op(&b))                   == Op(&a).WithOpcode(HloOpcode::kAbs)
+//                                              .WithOperand(0, Op(&b))
+//
+// Example binary instruction:
+//   Add()                             == Op().WithOpcode(HloOpcode::kAdd)
+//   Add(Op(&a), Op(&b))               == Op().WithOpcode(HloOpcode::kAdd)
+//                                            .WithOperand(0, Op(&a))
+//                                            .WithOperand(1, Op(&b))
+//   Add(&a, Op(&b), Op(&c))           == Op(&a).WithOpcode(HloOpcode::kAdd)
+//                                              .WithOperand(0, Op(&b))
+//                                              .WithOperand(1, Op(&c))
+//
+// Example ternary instruction:
+//   Clamp()                           == Op().WithOpcode(HloOpcode::kClamp)
+//   Clamp(Op(&a), Op(&b), Op(&c))     == Op().WithOpcode(HloOpcode::kClamp)
+//                                            .WithOperand(0, Op(&a))
+//                                            .WithOperand(1, Op(&b))
+//                                            .WithOperand(2, Op(&c))
+//   Clamp(&a, Op(&b), Op(&c), Op(&d)) == Op(&a).WithOpcode(HloOpcode::kClamp)
+//                                              .WithOperand(0, Op(&b))
+//                                              .WithOperand(1, Op(&c))
+//                                              .WithOperand(2, Op(&d))
+//
+template <typename Value, typename Pattern>
+bool Match(Value* value, const Pattern& pattern) {
+  return pattern.Match(value);
+}
+
+namespace match {
+
+namespace detail {
+
+template <typename LayoutType, typename Impl>
+class LayoutPattern;
+
+// The base LayoutPattern implementation. Matches only if the layout is not
+// nullptr.
+class LayoutPatternBaseImpl {
+ public:
+  bool Match(const ::xla::Layout* layout) const { return layout != nullptr; }
+};
+
+// A LayoutPattern implementation that matches only if the layout equals a
+// Layout proto.
+template <typename Previous>
+class LayoutPatternEqualImpl {
+ public:
+  explicit constexpr LayoutPatternEqualImpl(const Previous& previous,
+                                            const ::xla::Layout* layout)
+      : previous_(previous), layout_(layout) {}
+
+  bool Match(const ::xla::Layout* layout) const {
+    return previous_.Match(layout) && LayoutUtil::Equal(*layout_, *layout);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Layout* layout_;
+};
+
+// A LayoutPattern implementation that matches only if the layout has a given
+// format.
+template <typename Previous>
+class LayoutPatternFormatImpl {
+ public:
+  explicit constexpr LayoutPatternFormatImpl(const Previous& previous,
+                                             Format format)
+      : previous_(previous), format_(format) {}
+
+  bool Match(const ::xla::Layout* layout) const {
+    return previous_.Match(layout) && layout->format() == format_;
+  }
+
+ private:
+  Previous previous_;
+  Format format_;
+};
+
+// A pattern that matches Layouts.
+template <typename LayoutType, typename Impl>
+class LayoutPattern {
+ public:
+  explicit constexpr LayoutPattern(const Impl& impl,
+                                   LayoutType** matched_layout)
+      : impl_(impl), matched_layout_(matched_layout) {}
+
+  // Returns true and captures the layout iff it matches the pattern.
+  bool Match(const ::xla::Layout* layout) const {
+    if (impl_.Match(layout)) {
+      if (matched_layout_) {
+        *matched_layout_ = layout;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the layout iff it matches the pattern.
+  bool Match(::xla::Layout* layout) const {
+    if (impl_.Match(layout)) {
+      if (matched_layout_) {
+        *matched_layout_ = layout;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the layout equals the given proto.
+  // The layout must outlive the returned pattern.
+  constexpr LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>> EqualTo(
+      const Layout* layout) const {
+    return LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>>(
+        LayoutPatternEqualImpl<Impl>(impl_, layout), matched_layout_);
+  }
+
+  // Modifies the pattern to match only if the layout has a dense format.
+  constexpr LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>
+  WithDenseFormat() const {
+    return LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>(
+        LayoutPatternFormatImpl<Impl>(impl_, DENSE), matched_layout_);
+  }
+
+  // Modifies the pattern to match only if the layout has a sparse format.
+  constexpr LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>
+  WithSparseFormat() const {
+    return LayoutPattern<LayoutType, LayoutPatternFormatImpl<Impl>>(
+        LayoutPatternFormatImpl<Impl>(impl_, SPARSE), matched_layout_);
+  }
+
+ private:
+  Impl impl_;
+  LayoutType** matched_layout_;
+};
+
+}  // namespace detail
+
+// Creates a layout pattern that will capture the matched layout in the
+// argument.
+inline constexpr detail::LayoutPattern<const ::xla::Layout,
+                                       detail::LayoutPatternBaseImpl>
+Layout(const ::xla::Layout** matched_layout = nullptr) {
+  return detail::LayoutPattern<const ::xla::Layout,
+                               detail::LayoutPatternBaseImpl>(
+      detail::LayoutPatternBaseImpl(), matched_layout);
+}
+
+// Creates a layout pattern that will capture the matched layout in the
+// argument.
+inline constexpr detail::LayoutPattern<::xla::Layout,
+                                       detail::LayoutPatternBaseImpl>
+Layout(::xla::Layout** matched_layout) {
+  return detail::LayoutPattern<::xla::Layout, detail::LayoutPatternBaseImpl>(
+      detail::LayoutPatternBaseImpl(), matched_layout);
+}
+
+namespace detail {
+
+template <typename ShapeType, typename Impl>
+class ShapePattern;
+
+// The base ShapePattern implementation. Matches only if the shape is not
+// nullptr.
+class ShapePatternBaseImpl {
+ public:
+  bool Match(const ::xla::Shape* shape) const { return shape != nullptr; }
+};
+
+// A ShapePattern implementation that matches only if the shape equals a Shape
+// proto.
+template <typename Previous>
+class ShapePatternEqualImpl {
+ public:
+  explicit constexpr ShapePatternEqualImpl(const Previous& previous,
+                                           const ::xla::Shape* shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Equal(*shape_, *shape);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Shape* shape_;
+};
+
+// A ShapePattern implementation that matches only if the shape is compatible to
+// a Shape proto.
+template <typename Previous>
+class ShapePatternCompatibleImpl {
+ public:
+  explicit constexpr ShapePatternCompatibleImpl(const Previous& previous,
+                                                const ::xla::Shape* shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Compatible(*shape_, *shape);
+  }
+
+ private:
+  Previous previous_;
+  const ::xla::Shape* shape_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a given
+// element type.
+template <typename Previous>
+class ShapePatternElementTypeImpl {
+ public:
+  explicit constexpr ShapePatternElementTypeImpl(const Previous& previous,
+                                                 PrimitiveType element_type)
+      : previous_(previous), element_type_(element_type) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && shape->element_type() == element_type_;
+  }
+
+ private:
+  Previous previous_;
+  PrimitiveType element_type_;
+};
+
+// A ShapePattern implementation that matches only if the shape is scalar.
+template <typename Previous>
+class ShapePatternIsScalarImpl {
+ public:
+  explicit constexpr ShapePatternIsScalarImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsScalar(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape is an array
+template <typename Previous>
+class ShapePatternIsArrayImpl {
+ public:
+  explicit constexpr ShapePatternIsArrayImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsArray(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape is a tuple.
+template <typename Previous>
+class ShapePatternIsTupleImpl {
+ public:
+  explicit constexpr ShapePatternIsTupleImpl(const Previous& previous)
+      : previous_(previous) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IsTuple(*shape);
+  }
+
+ private:
+  Previous previous_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a given
+// rank.
+template <typename Previous>
+class ShapePatternRankImpl {
+ public:
+  explicit constexpr ShapePatternRankImpl(const Previous& previous, int64 rank)
+      : previous_(previous), rank_(rank) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::Rank(*shape) == rank_;
+  }
+
+ private:
+  Previous previous_;
+  int64 rank_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a layout
+// that matches a given pattern.
+template <typename Previous, typename LayoutType, typename LayoutImpl>
+class ShapePatternLayoutImpl {
+ public:
+  explicit constexpr ShapePatternLayoutImpl(
+      const Previous& previous,
+      const LayoutPattern<LayoutType, LayoutImpl>& layout)
+      : previous_(previous), layout_(layout) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && LayoutUtil::HasLayout(*shape) &&
+           layout_.Match(&shape->layout());
+  }
+
+  bool Match(Shape* shape) const {
+    return previous_.Match(shape) && LayoutUtil::HasLayout(*shape) &&
+           layout_.Match(shape->mutable_layout());
+  }
+
+ private:
+  Previous previous_;
+  LayoutPattern<LayoutType, LayoutImpl> layout_;
+};
+
+// A ShapePattern implementation that matches only if the shape has a subshape
+// that matches a given pattern.
+template <typename Previous, typename SubshapeType, typename SubshapeImpl>
+class ShapePatternSubshapeImpl {
+ public:
+  explicit ShapePatternSubshapeImpl(
+      const Previous& previous, ShapeIndexView index,
+      const ShapePattern<SubshapeType, SubshapeImpl>& subshape)
+      : previous_(previous), index_(index), subshape_(subshape) {}
+
+  bool Match(const ::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IndexIsValid(*shape, index_) &&
+           subshape_.Match(&ShapeUtil::GetSubshape(*shape, index_));
+  }
+
+  bool Match(::xla::Shape* shape) const {
+    return previous_.Match(shape) && ShapeUtil::IndexIsValid(*shape, index_) &&
+           subshape_.Match(ShapeUtil::GetMutableSubshape(shape, index_));
+  }
+
+ private:
+  Previous previous_;
+  ShapeIndexView index_;
+  ShapePattern<SubshapeType, SubshapeImpl> subshape_;
+};
+
+// A pattern that matches Shapes.
+template <typename ShapeType, typename Impl>
+class ShapePattern {
+ public:
+  explicit constexpr ShapePattern(const Impl& impl, ShapeType** matched_shape)
+      : impl_(impl), matched_shape_(matched_shape) {}
+
+  // Returns true and captures the shape iff it matches the pattern.
+  bool Match(const ::xla::Shape* shape) const {
+    if (impl_.Match(shape)) {
+      if (matched_shape_) {
+        *matched_shape_ = shape;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the shape iff it matches the pattern.
+  bool Match(::xla::Shape* shape) const {
+    if (impl_.Match(shape)) {
+      if (matched_shape_) {
+        *matched_shape_ = shape;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the shape equals the given proto.
+  // The layout must outlive the returned pattern.
+  constexpr ShapePattern<ShapeType, ShapePatternEqualImpl<Impl>> EqualTo(
+      const ::xla::Shape* shape) const {
+    return ShapePattern<ShapeType, ShapePatternEqualImpl<Impl>>(
+        ShapePatternEqualImpl<Impl>(impl_, shape), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is compatible to the given
+  // proto. The layout must outlive the returned pattern.
+  constexpr ShapePattern<ShapeType, ShapePatternCompatibleImpl<Impl>>
+  CompatibleTo(const ::xla::Shape* shape) const {
+    return ShapePattern<ShapeType, ShapePatternCompatibleImpl<Impl>>(
+        ShapePatternCompatibleImpl<Impl>(impl_, shape), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has the given element type.
+  constexpr ShapePattern<ShapeType, ShapePatternElementTypeImpl<Impl>>
+  WithElementType(PrimitiveType element_type) const {
+    return ShapePattern<ShapeType, ShapePatternElementTypeImpl<Impl>>(
+        ShapePatternElementTypeImpl<Impl>(impl_, element_type), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is scalar.
+  constexpr ShapePattern<ShapeType, ShapePatternIsScalarImpl<Impl>> IsScalar()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsScalarImpl<Impl>>(
+        ShapePatternIsScalarImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is an array.
+  constexpr ShapePattern<ShapeType, ShapePatternIsArrayImpl<Impl>> IsArray()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsArrayImpl<Impl>>(
+        ShapePatternIsArrayImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape is a tuple.
+  constexpr ShapePattern<ShapeType, ShapePatternIsTupleImpl<Impl>> IsTuple()
+      const {
+    return ShapePattern<ShapeType, ShapePatternIsTupleImpl<Impl>>(
+        ShapePatternIsTupleImpl<Impl>(impl_), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has the given rank.
+  constexpr ShapePattern<ShapeType, ShapePatternRankImpl<Impl>> WithRank(
+      int64 rank) const {
+    return ShapePattern<ShapeType, ShapePatternRankImpl<Impl>>(
+        ShapePatternRankImpl<Impl>(impl_, rank), matched_shape_);
+  }
+
+  // Modifies the pattern to match only if the shape has a layout that matches
+  // the given pattern.
+  template <typename LayoutType, typename LayoutImpl>
+  constexpr ShapePattern<ShapeType,
+                         ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>>
+  WithLayout(const LayoutPattern<LayoutType, LayoutImpl>& layout) const {
+    return ShapePattern<ShapeType,
+                        ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>>(
+        ShapePatternLayoutImpl<Impl, LayoutType, LayoutImpl>(impl_, layout),
+        matched_shape_);
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternEqualImpl<LayoutPatternBaseImpl>>>
+  WithLayoutEqualTo(const ::xla::Layout* layout) const {
+    return WithLayout(Layout().EqualTo(layout));
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
+  IsDenseArray(const ::xla::Layout* layout) const {
+    return WithLayout(Layout().WithDenseFormat());
+  }
+
+  constexpr ShapePattern<
+      ShapeType,
+      ShapePatternLayoutImpl<Impl, const ::xla::Layout,
+                             LayoutPatternFormatImpl<LayoutPatternBaseImpl>>>
+  IsSparseArray(const ::xla::Layout* layout) const {
+    return WithLayout(Layout().WithSparseFormat());
+  }
+
+  // Modifies the pattern to match only if the shape has a subshape that matches
+  // the given pattern.
+  template <typename SubshapeType, typename SubshapeImpl>
+  ShapePattern<ShapeType,
+               ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>>
+  WithSubshape(ShapeIndexView index,
+               const ShapePattern<SubshapeType, SubshapeImpl>& subshape) const {
+    return ShapePattern<
+        ShapeType, ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>>(
+        ShapePatternSubshapeImpl<Impl, SubshapeType, SubshapeImpl>(impl_, index,
+                                                                   subshape),
+        matched_shape_);
+  }
+
+  ShapePattern<ShapeType, ShapePatternSubshapeImpl<
+                              Impl, const ::xla::Shape,
+                              ShapePatternEqualImpl<ShapePatternBaseImpl>>>
+  WithSubshapeEqualTo(ShapeIndexView index, const ::xla::Shape* shape) const {
+    return WithSubshape(index,
+                        ShapePattern<const ::xla::Shape, ShapePatternBaseImpl>(
+                            ShapePatternBaseImpl(), nullptr)
+                            .EqualTo(shape));
+  }
+
+  ShapePattern<ShapeType, ShapePatternSubshapeImpl<
+                              Impl, const ::xla::Shape,
+                              ShapePatternCompatibleImpl<ShapePatternBaseImpl>>>
+  WithSubshapeCompatibleTo(ShapeIndexView index,
+                           const ::xla::Shape* shape) const {
+    return WithSubshape(index,
+                        ShapePattern<const ::xla::Shape, ShapePatternBaseImpl>(
+                            ShapePatternBaseImpl(), nullptr)
+                            .CompatibleTo(shape));
+  }
+
+ private:
+  Impl impl_;
+  ShapeType** matched_shape_;
+};
+
+}  // namespace detail
+
+// Creates a shape pattern that will capture the matched layout in the argument.
+inline constexpr detail::ShapePattern<const ::xla::Shape,
+                                      detail::ShapePatternBaseImpl>
+Shape(const ::xla::Shape** matched_shape = nullptr) {
+  return detail::ShapePattern<const ::xla::Shape, detail::ShapePatternBaseImpl>(
+      detail::ShapePatternBaseImpl(), matched_shape);
+}
+
+// Creates a shape pattern that will capture the matched layout in the argument.
+inline constexpr detail::ShapePattern<::xla::Shape,
+                                      detail::ShapePatternBaseImpl>
+Shape(::xla::Shape** matched_shape) {
+  return detail::ShapePattern<::xla::Shape, detail::ShapePatternBaseImpl>(
+      detail::ShapePatternBaseImpl(), matched_shape);
+}
+
+namespace detail {
+
+template <typename HloInstructionType, typename Impl>
+class HloInstructionPattern;
+
+// The base HloInstructionPattern implementation. Matches only if the
+// instruction is not nullptr.
+class HloInstructionPatternBaseImpl {
+ public:
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return inst != nullptr;
+  }
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a given name.
+template <typename Previous>
+class HloInstructionPatternNameImpl {
+ public:
+  explicit HloInstructionPatternNameImpl(const Previous& previous,
+                                         tensorflow::StringPiece name)
+      : previous_(previous), name_(name) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && inst->name() == name_;
+  }
+
+ private:
+  Previous previous_;
+  tensorflow::StringPiece name_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a given opcode.
+template <typename Previous>
+class HloInstructionPatternOpcodeImpl {
+ public:
+  explicit constexpr HloInstructionPatternOpcodeImpl(const Previous& previous,
+                                                     HloOpcode opcode,
+                                                     bool invert)
+      : previous_(previous), opcode_(opcode), invert_(invert) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && (invert_ ^ (inst->opcode() == opcode_));
+  }
+
+ private:
+  Previous previous_;
+  HloOpcode opcode_;
+  bool invert_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has a shape that matches a given pattern.
+template <typename Previous, typename ShapeType, typename ShapeImpl>
+class HloInstructionPatternShapeImpl {
+ public:
+  explicit constexpr HloInstructionPatternShapeImpl(
+      const Previous& previous, const ShapePattern<ShapeType, ShapeImpl>& shape)
+      : previous_(previous), shape_(shape) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && shape_.Match(&inst->shape());
+  }
+
+  bool Match(::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && shape_.Match(inst->mutable_shape());
+  }
+
+ private:
+  Previous previous_;
+  ShapePattern<ShapeType, ShapeImpl> shape_;
+};
+
+// An HloInstructionPattern implementation that matches only if the instruction
+// has an operand that matches a given pattern.
+template <typename Previous, typename OperandType, typename OperandImpl>
+class HloInstructionPatternOperandImpl {
+ public:
+  explicit constexpr HloInstructionPatternOperandImpl(
+      const Previous& previous, int64 operand_index,
+      const HloInstructionPattern<OperandType, OperandImpl>& operand)
+      : previous_(previous), operand_index_(operand_index), operand_(operand) {}
+
+  bool Match(const ::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && operand_index_ < inst->operand_count() &&
+           operand_.Match(inst->operand(operand_index_));
+  }
+
+  bool Match(::xla::HloInstruction* inst) const {
+    return previous_.Match(inst) && operand_index_ < inst->operand_count() &&
+           operand_.Match(inst->mutable_operand(operand_index_));
+  }
+
+ private:
+  Previous previous_;
+  int64 operand_index_;
+  HloInstructionPattern<OperandType, OperandImpl> operand_;
+};
+
+// A pattern that matches HloInstructions.
+template <typename HloInstructionType, typename Impl>
+class HloInstructionPattern {
+ public:
+  explicit constexpr HloInstructionPattern(const Impl& impl,
+                                           HloInstructionType** matched_inst)
+      : impl_(impl), matched_inst_(matched_inst) {}
+
+  // Returns true and captures the instruction iff it matches the pattern.
+  bool Match(const ::xla::HloInstruction* inst) const {
+    if (impl_.Match(inst)) {
+      if (matched_inst_) {
+        *matched_inst_ = inst;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Returns true and captures the instruction iff it matches the pattern.
+  bool Match(::xla::HloInstruction* inst) const {
+    if (impl_.Match(inst)) {
+      if (matched_inst_) {
+        *matched_inst_ = inst;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Modifies the pattern to match only if the instruction has the given name.
+  HloInstructionPattern<HloInstructionType, HloInstructionPatternNameImpl<Impl>>
+  WithName(tensorflow::StringPiece name) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternNameImpl<Impl>>(
+        HloInstructionPatternNameImpl<Impl>(impl_, name), matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction has the given opcode.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  WithOpcode(HloOpcode opcode) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternOpcodeImpl<Impl>>(
+        HloInstructionPatternOpcodeImpl<Impl>(impl_, opcode, false),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction does not have the
+  // given opcode.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  WithoutOpcode(HloOpcode opcode) const {
+    return HloInstructionPattern<HloInstructionType,
+                                 HloInstructionPatternOpcodeImpl<Impl>>(
+        HloInstructionPatternOpcodeImpl<Impl>(impl_, opcode, true),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction is a constant.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  IsConstant() const {
+    return WithOpcode(HloOpcode::kConstant);
+  }
+
+  // Modifies the pattern to match only if the instruction is not a constant.
+  constexpr HloInstructionPattern<HloInstructionType,
+                                  HloInstructionPatternOpcodeImpl<Impl>>
+  IsNonConstant() const {
+    return WithoutOpcode(HloOpcode::kConstant);
+  }
+
+  // Modifies the pattern to match only if the instruction has a shape that
+  // matches the given pattern.
+  template <typename ShapeType, typename ShapeImpl>
+  constexpr HloInstructionPattern<
+      HloInstructionType,
+      HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>>
+  WithShape(const ShapePattern<ShapeType, ShapeImpl>& shape) const {
+    return HloInstructionPattern<
+        HloInstructionType,
+        HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>>(
+        HloInstructionPatternShapeImpl<Impl, ShapeType, ShapeImpl>(impl_,
+                                                                   shape),
+        matched_inst_);
+  }
+
+  // Modifies the pattern to match only if the instruction has an operand that
+  // matches the given pattern.
+  template <typename OperandType, typename OperandImpl>
+  constexpr HloInstructionPattern<
+      HloInstructionType,
+      HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>>
+  WithOperand(
+      int64 operand_index,
+      const HloInstructionPattern<OperandType, OperandImpl>& operand) const {
+    return HloInstructionPattern<
+        HloInstructionType,
+        HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>>(
+        HloInstructionPatternOperandImpl<Impl, OperandType, OperandImpl>(
+            impl_, operand_index, operand),
+        matched_inst_);
+  }
+
+ private:
+  Impl impl_;
+  HloInstructionType** matched_inst_;
+};
+
+}  // namespace detail
+
+// Creates an instruction pattern that will capture the matched instruction in
+// the argument.
+inline constexpr detail::HloInstructionPattern<
+    const ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl>
+Op(const ::xla::HloInstruction** matched_inst = nullptr) {
+  return detail::HloInstructionPattern<const ::xla::HloInstruction,
+                                       detail::HloInstructionPatternBaseImpl>(
+      detail::HloInstructionPatternBaseImpl(), matched_inst);
+}
+
+// Creates an instruction pattern that will capture the matched instruction in
+// the argument.
+inline constexpr detail::HloInstructionPattern<
+    ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl>
+Op(::xla::HloInstruction** matched_inst) {
+  return detail::HloInstructionPattern<::xla::HloInstruction,
+                                       detail::HloInstructionPatternBaseImpl>(
+      detail::HloInstructionPatternBaseImpl(), matched_inst);
+}
+
+// Helpers for nullary instructions.
+#define XLA_NULLOP_PATTERN(NAME)                                      \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \
+    return Op().WithOpcode(HloOpcode::k##NAME);                       \
+  }                                                                   \
+                                                                      \
+  template <typename HloInstructionType>                              \
+  inline auto NAME(HloInstructionType** matched_inst)                 \
+      ->decltype(Op(matched_inst).WithOpcode(HloOpcode::k##NAME)) {   \
+    return Op(matched_inst).WithOpcode(HloOpcode::k##NAME);           \
+  }
+XLA_NULLOP_PATTERN(Constant)
+XLA_NULLOP_PATTERN(Infeed)
+XLA_NULLOP_PATTERN(Parameter)
+XLA_NULLOP_PATTERN(Recv)
+#undef XLA_NULLOP_PATTERN
+
+// Helpers for unary instructions.
+#define XLA_UNOP_PATTERN(NAME)                                        \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \
+    return Op().WithOpcode(HloOpcode::k##NAME);                       \
+  }                                                                   \
+                                                                      \
+  template <typename Arg>                                             \
+  inline auto NAME(Arg&& arg)->decltype(                              \
+      Op().WithOpcode(HloOpcode::k##NAME)                             \
+          .WithOperand(0, std::forward<Arg>(arg))) {                  \
+    return Op()                                                       \
+        .WithOpcode(HloOpcode::k##NAME)                               \
+        .WithOperand(0, std::forward<Arg>(arg));                      \
+  }                                                                   \
+                                                                      \
+  template <typename HloInstructionType, typename Arg>                \
+  inline auto NAME(HloInstructionType** matched_inst, Arg&& arg)      \
+      ->decltype(Op(matched_inst)                                     \
+                     .WithOpcode(HloOpcode::k##NAME)                  \
+                     .WithOperand(0, std::forward<Arg>(arg))) {       \
+    return Op(matched_inst)                                           \
+        .WithOpcode(HloOpcode::k##NAME)                               \
+        .WithOperand(0, std::forward<Arg>(arg));                      \
+  }
+XLA_UNOP_PATTERN(Abs)
+XLA_UNOP_PATTERN(RoundNearestAfz)
+XLA_UNOP_PATTERN(Bitcast)
+XLA_UNOP_PATTERN(Broadcast)
+XLA_UNOP_PATTERN(BroadcastDimOne)
+XLA_UNOP_PATTERN(Ceil)
+XLA_UNOP_PATTERN(Copy)
+XLA_UNOP_PATTERN(Cos)
+XLA_UNOP_PATTERN(Exp)
+XLA_UNOP_PATTERN(Fft)
+XLA_UNOP_PATTERN(Floor)
+XLA_UNOP_PATTERN(Imag)
+XLA_UNOP_PATTERN(IsFinite)
+XLA_UNOP_PATTERN(Log)
+XLA_UNOP_PATTERN(Not)
+XLA_UNOP_PATTERN(Negate)
+XLA_UNOP_PATTERN(Outfeed)
+XLA_UNOP_PATTERN(Real)
+XLA_UNOP_PATTERN(Reduce)
+XLA_UNOP_PATTERN(ReducePrecision)
+XLA_UNOP_PATTERN(Reshape)
+XLA_UNOP_PATTERN(Reverse)
+XLA_UNOP_PATTERN(Send)
+XLA_UNOP_PATTERN(Sign)
+XLA_UNOP_PATTERN(Sin)
+XLA_UNOP_PATTERN(Sort)
+XLA_UNOP_PATTERN(Tanh)
+XLA_UNOP_PATTERN(Transpose)
+#undef XLA_UNOP_PATTERN
+
+// Helpers for binary instructions.
+#define XLA_BINOP_PATTERN(NAME)                                             \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) {       \
+    return Op().WithOpcode(HloOpcode::k##NAME);                             \
+  }                                                                         \
+                                                                            \
+  template <typename Lhs, typename Rhs>                                     \
+  inline auto NAME(Lhs&& lhs, Rhs&& rhs)                                    \
+      ->decltype(Op().WithOpcode(HloOpcode::k##NAME)                        \
+                     .WithOperand(0, std::forward<Lhs>(lhs))                \
+                     .WithOperand(1, std::forward<Rhs>(rhs))) {             \
+    return Op()                                                             \
+        .WithOpcode(HloOpcode::k##NAME)                                     \
+        .WithOperand(0, std::forward<Lhs>(lhs))                             \
+        .WithOperand(1, std::forward<Rhs>(rhs));                            \
+  }                                                                         \
+                                                                            \
+  template <typename HloInstructionType, typename Lhs, typename Rhs>        \
+  inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) \
+      ->decltype(Op(matched_inst)                                           \
+                     .WithOpcode(HloOpcode::k##NAME)                        \
+                     .WithOperand(0, std::forward<Lhs>(lhs))                \
+                     .WithOperand(1, std::forward<Rhs>(rhs))) {             \
+    return Op(matched_inst)                                                 \
+        .WithOpcode(HloOpcode::k##NAME)                                     \
+        .WithOperand(0, std::forward<Lhs>(lhs))                             \
+        .WithOperand(1, std::forward<Rhs>(rhs));                            \
+  }
+XLA_BINOP_PATTERN(Add)
+XLA_BINOP_PATTERN(Atan2)
+XLA_BINOP_PATTERN(Divide)
+XLA_BINOP_PATTERN(Complex)
+XLA_BINOP_PATTERN(Dot)
+XLA_BINOP_PATTERN(Eq)
+XLA_BINOP_PATTERN(Gather)
+XLA_BINOP_PATTERN(Ge)
+XLA_BINOP_PATTERN(Gt)
+XLA_BINOP_PATTERN(Le)
+XLA_BINOP_PATTERN(Lt)
+XLA_BINOP_PATTERN(Maximum)
+XLA_BINOP_PATTERN(Minimum)
+XLA_BINOP_PATTERN(Multiply)
+XLA_BINOP_PATTERN(Ne)
+XLA_BINOP_PATTERN(Power)
+XLA_BINOP_PATTERN(Remainder)
+XLA_BINOP_PATTERN(Subtract)
+XLA_BINOP_PATTERN(And)
+XLA_BINOP_PATTERN(Or)
+XLA_BINOP_PATTERN(ShiftLeft)
+XLA_BINOP_PATTERN(ShiftRightArithmetic)
+XLA_BINOP_PATTERN(ShiftRightLogical)
+#undef XLA_BINOP_PATTERN
+
+// Helpers for ternary instructions.
+#define XLA_TERNOP_PATTERN(NAME)                                       \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) {  \
+    return Op().WithOpcode(HloOpcode::k##NAME);                        \
+  }                                                                    \
+                                                                       \
+  template <typename Arg0, typename Arg1, typename Arg2>               \
+  inline auto NAME(Arg0&& arg0, Arg1&& arg1, Arg2&& arg2)              \
+      ->decltype(Op().WithOpcode(HloOpcode::k##NAME)                   \
+                     .WithOperand(0, std::forward<Arg0>(arg0))         \
+                     .WithOperand(1, std::forward<Arg1>(arg1))         \
+                     .WithOperand(2, std::forward<Arg2>(arg2))) {      \
+    return Op()                                                        \
+        .WithOpcode(HloOpcode::k##NAME)                                \
+        .WithOperand(0, std::forward<Arg0>(arg0))                      \
+        .WithOperand(1, std::forward<Arg1>(arg1))                      \
+        .WithOperand(2, std::forward<Arg2>(arg2));                     \
+  }                                                                    \
+                                                                       \
+  template <typename HloInstructionType, typename Arg0, typename Arg1, \
+            typename Arg2>                                             \
+  inline auto NAME(HloInstructionType** matched_inst, Arg0&& arg0,     \
+                   Arg1&& arg1, Arg2&& arg2)                           \
+      ->decltype(Op(matched_inst)                                      \
+                     .WithOpcode(HloOpcode::k##NAME)                   \
+                     .WithOperand(0, std::forward<Arg0>(arg0))         \
+                     .WithOperand(1, std::forward<Arg1>(arg1))         \
+                     .WithOperand(2, std::forward<Arg2>(arg2))) {      \
+    return Op(matched_inst)                                            \
+        .WithOpcode(HloOpcode::k##NAME)                                \
+        .WithOperand(0, std::forward<Arg0>(arg0))                      \
+        .WithOperand(1, std::forward<Arg1>(arg1))                      \
+        .WithOperand(2, std::forward<Arg2>(arg2));                     \
+  }
+XLA_TERNOP_PATTERN(Clamp);
+XLA_TERNOP_PATTERN(Select);
+#undef XLA_TERNOP_PATTERN
+
+// Helpers for matching non-constant instructions.
+inline auto NonConstant() -> decltype(Op().IsNonConstant()) {
+  return Op().IsNonConstant();
+}
+
+template <typename HloInstructionType>
+inline auto NonConstant(HloInstructionType** matched_inst)
+    -> decltype(Op(matched_inst).IsNonConstant()) {
+  return Op(matched_inst).IsNonConstant();
+}
+
+}  // namespace match
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5291b1437afc67312382fe52bf9a66a1843b1b4c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+TEST(PatternMatcherTest, AddOp) {
+  constexpr char kModuleStr[] = R"(HloModule two_plus_two_module
+    ENTRY %two_plus_two_computation () -> f32[] {
+      %two = f32[] constant(2)
+      ROOT %two_plus_two = f32[] add(f32[] %two, f32[] %two)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr));
+
+  const HloInstruction* matched_inst;
+  HloInstruction* matched_operand;
+  Shape* matched_shape;
+  Layout* matched_layout;
+
+  ASSERT_TRUE(Match(
+      hlo_module->entry_computation()->root_instruction(),
+      match::Op(&matched_inst)
+          .WithName("two_plus_two")
+          .WithOpcode(HloOpcode::kAdd)
+          .WithShape(
+              match::Shape(&matched_shape)
+                  .WithLayout(match::Layout(&matched_layout).WithDenseFormat()))
+          .WithOperand(
+              0,
+              match::Op(&matched_operand).WithOpcode(HloOpcode::kConstant))));
+  ASSERT_NE(matched_inst, nullptr);
+  EXPECT_EQ(matched_inst->name(), "two_plus_two");
+  EXPECT_EQ(matched_inst->opcode(), HloOpcode::kAdd);
+
+  EXPECT_TRUE(Match(hlo_module->entry_computation()->root_instruction(),
+                    match::Add(match::Constant(), match::Constant())));
+
+  EXPECT_FALSE(Match(hlo_module->entry_computation()->root_instruction(),
+                     match::Op().WithName("bad_name")));
+  matched_inst = nullptr;
+  EXPECT_FALSE(Match(hlo_module->entry_computation()->root_instruction(),
+                     match::Multiply(&matched_inst, match::Op(), match::Op())));
+}
+
+TEST(PatternMatcherTest, ScalarShape) {
+  auto scalar_shape = ShapeUtil::MakeShape(F32, {});
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape(&matched_shape).IsScalar()));
+  EXPECT_EQ(matched_shape, &scalar_shape);
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsArray()));
+  EXPECT_FALSE(Match(&scalar_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithRank(0)));
+  EXPECT_FALSE(Match(
+      &scalar_shape,
+      match::Shape().WithSubshape({0}, match::Shape()).WithElementType(F32)));
+}
+
+TEST(PatternMatcherTest, ArrayShape) {
+  auto array_shape = ShapeUtil::MakeShape(F32, {2, 3, 4});
+  Shape* matched_shape;
+  EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray()));
+  EXPECT_EQ(matched_shape, &array_shape);
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar()));
+  EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple()));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32)));
+  EXPECT_TRUE(Match(&array_shape, match::Shape().WithRank(3)));
+  EXPECT_FALSE(
+      Match(&array_shape, match::Shape().WithSubshape({0}, match::Shape())));
+  Layout* matched_layout;
+  EXPECT_FALSE(Match(&array_shape,
+                     match::Shape().WithLayout(
+                         match::Layout(&matched_layout).WithSparseFormat())));
+}
+
+TEST(PatternMatcherTest, TupleShape) {
+  auto tuple_shape = ShapeUtil::MakeTupleShape({
+      ShapeUtil::MakeShape(F32, {1, 2, 3}),
+      ShapeUtil::MakeShape(S32, {4, 5}),
+  });
+  EXPECT_TRUE(Match(&tuple_shape, match::Shape().IsTuple()));
+  EXPECT_FALSE(Match(&tuple_shape, match::Shape().IsArray()));
+  EXPECT_FALSE(Match(&tuple_shape, match::Shape().IsScalar()));
+
+  Shape* subshape;
+  ASSERT_TRUE(Match(
+      &tuple_shape,
+      match::Shape().WithSubshape(
+          {0}, match::Shape(&subshape).WithElementType(F32).WithRank(3))));
+  ASSERT_NE(subshape, nullptr);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(*subshape, ShapeUtil::GetSubshape(tuple_shape, {0})));
+  EXPECT_TRUE(Match(&tuple_shape,
+                    match::Shape().WithSubshape(
+                        {0}, match::Shape().EqualTo(
+                                 &ShapeUtil::GetSubshape(tuple_shape, {0})))));
+  EXPECT_FALSE(Match(&tuple_shape,
+                     match::Shape().WithSubshape(
+                         {0}, match::Shape().EqualTo(
+                                  &ShapeUtil::GetSubshape(tuple_shape, {1})))));
+
+  ASSERT_TRUE(Match(
+      &tuple_shape,
+      match::Shape().WithSubshape(
+          {1}, match::Shape(&subshape).WithElementType(S32).WithRank(2))));
+  ASSERT_NE(subshape, nullptr);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(*subshape, ShapeUtil::GetSubshape(tuple_shape, {1})));
+  EXPECT_TRUE(Match(&tuple_shape,
+                    match::Shape().WithSubshape(
+                        {1}, match::Shape().EqualTo(
+                                 &ShapeUtil::GetSubshape(tuple_shape, {1})))));
+  EXPECT_FALSE(Match(&tuple_shape,
+                     match::Shape().WithSubshape(
+                         {1}, match::Shape().EqualTo(
+                                  &ShapeUtil::GetSubshape(tuple_shape, {0})))));
+
+  EXPECT_FALSE(
+      Match(&tuple_shape, match::Shape().WithSubshape({2}, match::Shape())));
+  EXPECT_FALSE(
+      Match(&tuple_shape, match::Shape().WithSubshape({0, 0}, match::Shape())));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index ec883a6cf3ce9546ac54f5c2524a8eda53bad33f..52500e4e79042c51d4bea17dea6845ed23433d6c 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -1544,6 +1544,50 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 
   // Since the shape_with_output_layout option in ExecutionOption is
   // non-effective to the Evaluator results, explicit relayout here.
+  //
+  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
+  if (arg->has_output_layout()) {
+    result_literal = result_literal->Relayout(arg->output_layout());
+  }
+  *result->mutable_literal() = result_literal->ToProto();
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status Service::ComputeConstantGraph(
+    const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) {
+  if (!arg->has_computation()) {
+    return InvalidArgument("computations may not be empty");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("program shape may not be empty");
+  }
+  if (arg->computation().program_shape().parameters_size() != 0) {
+    return InvalidArgument(
+        "constant computation may not depend on any parameters.");
+  }
+
+  ProgramShape program_shape = arg->computation().program_shape();
+  TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
+  if (arg->has_output_layout()) {
+    TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
+        arg->output_layout(), program_shape.result()));
+  }
+
+  HloModuleConfig config(program_shape);
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(arg->computation(), config));
+
+  HloEvaluator evaluator;
+  TF_ASSIGN_OR_RETURN(auto result_literal,
+                      evaluator.Evaluate<std::unique_ptr<Literal>>(
+                          *module, /*arg_literals=*/{}));
+
+  // Since the result layout is non-effective to the Evaluator results, explicit
+  // relayout here.
+  //
+  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
   if (arg->has_output_layout()) {
     result_literal = result_literal->Relayout(arg->output_layout());
   }
@@ -1617,7 +1661,14 @@ tensorflow::Status Service::GetComputationStats(
 
 tensorflow::Status Service::GetComputationGraphStats(
     const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) {
-  HloModuleConfig config;
+  if (!arg->has_computation()) {
+    return InvalidArgument("Computations may not be empty.");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("Program shape may not be empty.");
+  }
+
+  HloModuleConfig config(arg->computation().program_shape());
   config.set_debug_options(arg->debug_options());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       HloModule::CreateFromProto(arg->computation(), config));
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 9fa72c1b8c859b2b4a1b79abb32a63560dae8ec4..e399f1ac1904f8d6145f43b0ed12d8018765d9a1 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -206,6 +206,9 @@ class Service : public ServiceInterface {
   // Computes the value of a constant expression.
   tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
                                      ComputeConstantResponse* result) override;
+  tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) override;
 
   // Returns the shape (with layout) of an array associated with a given data
   // handle.
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 32aae64973dbd7ac2f8d403d8fbd155d432642f9..5b44c26b7c7b082556d9533cf3b3b1b98e5e4b09 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -112,6 +112,10 @@ class ServiceInterface {
   virtual tensorflow::Status ComputeConstant(
       const ComputeConstantRequest* arg, ComputeConstantResponse* result) = 0;
 
+  virtual tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) = 0;
+
   // Methods used by Computation.
   virtual tensorflow::Status SnapshotComputation(
       const SnapshotComputationRequest* ag,
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 6825d2476587d037aace043230168f78f4e46344..ac7e201bfdceabdd0f11db61bbb3b460017401ca 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -824,6 +824,18 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return new_shape;
 }
 
+/* static */ bool ShapeUtil::IndexIsValid(const Shape& shape,
+                                          ShapeIndexView index) {
+  const Shape* subshape = &shape;
+  for (auto i : index) {
+    if (!IsTuple(*subshape) || i >= subshape->tuple_shapes_size()) {
+      return false;
+    }
+    subshape = &subshape->tuple_shapes(i);
+  }
+  return true;
+}
+
 /* static */ const Shape& ShapeUtil::GetSubshape(const Shape& shape,
                                                  ShapeIndexView index) {
   const Shape* return_shape = &shape;
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 3e130a02e2ce853ee157e46afb9760f5ff5a5026..63da9154cfc1a5e7e8c0eeaa103d27096540fefe 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -28,8 +28,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -317,6 +319,11 @@ class ShapeUtil {
   // Returns an empty tuple shape. Can be used to indicate side-effects.
   static Shape MakeNil() { return MakeTupleShape({}); }
 
+  // Checks whether the shape is initialized.
+  static bool IsInitialized(const Shape& shape) {
+    return shape.element_type() != PRIMITIVE_TYPE_INVALID;
+  }
+
   // Constructs a new shape with the given element type and sequence of
   // dimensions.
   static Shape MakeShape(PrimitiveType element_type,
@@ -441,6 +448,9 @@ class ShapeUtil {
   static bool ShapeIs(const Shape& shape, PrimitiveType element_type,
                       std::initializer_list<int64> dimensions);
 
+  // Returns true if the given shape has a subshape at the given index.
+  static bool IndexIsValid(const Shape& shape, ShapeIndexView index);
+
   // GetSubshape and GetMutableSubshape return a particular nested Shape within
   // the given Shape argument.
   static const Shape& GetSubshape(const Shape& shape, ShapeIndexView index);
@@ -583,34 +593,7 @@ class ShapeUtil {
                                        tensorflow::gtl::ArraySlice<int64> count,
                                        tensorflow::gtl::ArraySlice<int64> incr,
                                        const FnType& visitor_function) {
-    if (ShapeUtil::HasZeroElements(shape)) {
-      return Status::OK();
-    }
-    CHECK_EQ(Rank(shape), base.size());
-    CHECK_EQ(incr.size(), base.size());
-    CHECK_EQ(count.size(), base.size());
-    const int64 rank = LayoutUtil::MinorToMajor(shape).size();
-    // Allows handling R0 arrays, such that the visitor function will be called
-    // once with the proper empty indexes.
-    int64 n = -1;
-    std::vector<int64> indexes(base.begin(), base.end());
-    while (n < rank) {
-      TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
-      if (!should_continue) {
-        break;
-      }
-      // Increments dimensions in minor to major order.
-      for (n = 0; n < rank; ++n) {
-        int64 dim = LayoutUtil::Minor(shape.layout(), n);
-        indexes[dim] += incr[dim];
-        if (indexes[dim] < base[dim] + count[dim]) {
-          break;
-        }
-        indexes[dim] = base[dim];
-      }
-    }
-
-    return Status::OK();
+    return ForEachIndexInternal(shape, base, count, incr, visitor_function);
   }
 
   // Simple ergonomic wrapper around ShapeUtil::ForEachIndexWithStatus.
@@ -642,11 +625,83 @@ class ShapeUtil {
         .IgnoreError();
   }
 
+  // A parallel version of ForEachIndex(WithStatus). This can only be used if
+  // the visitor_function is thread-safe and the order of iteration does not
+  // matter.
+  //
+  // visitor_function must be a callable of type
+  // void(ArraySlice<int64>) or compatible.
+  template <typename FnType>
+  static void ForEachIndexParallel(const Shape& shape,
+                                   tensorflow::gtl::ArraySlice<int64> base,
+                                   tensorflow::gtl::ArraySlice<int64> count,
+                                   tensorflow::gtl::ArraySlice<int64> incr,
+                                   const FnType& visitor_function) {
+    // The parallel version of ForEachIndexInternal can never fail.
+    CHECK(ForEachIndexInternal(
+              shape, base, count, incr,
+              [&visitor_function](tensorflow::gtl::ArraySlice<int64> indexes)
+                  -> StatusOr<bool> {
+                visitor_function(indexes);
+                return true;
+              },
+              /*parallel=*/true)
+              .ok());
+  }
+
  private:
   // Validates all of the non-layout properties of the shape -- this is a helper
   // used by both the layout-optional and layout-required public method.
   static Status ValidateShapeWithOptionalLayoutInternal(const Shape& shape);
 
+  template <typename FnType>
+  static Status ForEachIndexInternal(const Shape& shape,
+                                     tensorflow::gtl::ArraySlice<int64> base,
+                                     tensorflow::gtl::ArraySlice<int64> count,
+                                     tensorflow::gtl::ArraySlice<int64> incr,
+                                     const FnType& visitor_function,
+                                     bool parallel = false) {
+    if (ShapeUtil::HasZeroElements(shape)) {
+      return Status::OK();
+    }
+    CHECK_EQ(Rank(shape), base.size());
+    CHECK_EQ(incr.size(), base.size());
+    CHECK_EQ(count.size(), base.size());
+    const int64 rank = LayoutUtil::MinorToMajor(shape).size();
+    // Allows handling R0 arrays, such that the visitor function will be called
+    // once with the proper empty indexes.
+    int64 n = -1;
+    std::vector<int64> indexes(base.begin(), base.end());
+    const int kNumThreads = tensorflow::port::NumSchedulableCPUs();
+    tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
+    if (parallel) {
+      pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads);
+    }
+
+    while (n < rank) {
+      if (pool != tensorflow::gtl::nullopt) {
+        pool->Schedule(
+            [indexes, &visitor_function] { visitor_function(indexes); });
+      } else {
+        TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
+        if (!should_continue) {
+          break;
+        }
+      }
+      // Increments dimensions in minor to major order.
+      for (n = 0; n < rank; ++n) {
+        int64 dim = LayoutUtil::Minor(shape.layout(), n);
+        indexes[dim] += incr[dim];
+        if (indexes[dim] < base[dim] + count[dim]) {
+          break;
+        }
+        indexes[dim] = base[dim];
+      }
+    }
+
+    return Status::OK();
+  }
+
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil);
 };
 
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 424cfe37ea44d64884e08695fd1f49ca1970ca62..13582a2a2678548dfc8e9c329dfb6def9d51fc9d 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -624,6 +624,24 @@ TEST(ShapeUtilTest, ForEachIndexWithStatus) {
   EXPECT_EQ(invocations, 5);
 }
 
+TEST(ShapeUtilTest, ForEachIndexParallel) {
+  Shape shape = ShapeUtil::MakeShape(F32, {10, 10});
+  int64 output[10][10];
+  int init = 5;
+  auto set_func = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+    output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1];
+  };
+
+  ShapeUtil::ForEachIndexParallel(shape, /*base=*/{0, 0}, /*count=*/{10, 10},
+                                  /*incr=*/{1, 1}, set_func);
+
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < 10; ++j) {
+      EXPECT_EQ(output[i][j], init + i + j);
+    }
+  }
+}
+
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1x1_to_1x1x1) {
   // All output dimensions should be unmodified. One of the input dimensions is
   // modified because the input rank is larger by one.
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 6f58c20f34e30324ca36dbc7fa78ebb82a4b435d..1f90a44d8ba725c1bc7d23b581161f8915ff74fd 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -191,6 +191,8 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -413,6 +415,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -488,9 +492,10 @@ xla_test(
     tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -639,9 +644,9 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -777,10 +782,10 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -856,11 +861,11 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1063,6 +1068,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1085,10 +1092,11 @@ xla_test_library(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1125,11 +1133,11 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1266,9 +1274,9 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1372,11 +1380,10 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1394,8 +1401,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1477,11 +1484,14 @@ xla_test(
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1545,6 +1555,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1961,3 +1973,16 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
+
+xla_test(
+    name = "test_utils_test",
+    srcs = ["test_utils_test.cc"],
+    deps = [
+        ":local_client_test_base",
+        ":test_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index af8af99c791e2a40cfcfa2291b786b33e5652267..f3dac75a44b948c4b45b80b93e7462073010979e 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -69,14 +69,12 @@ class BatchNormalizationTest
     CHECK_EQ(kY, input_array_.width());
   }
 
-  ComputationDataHandle CheckShape(ComputationBuilder* b,
-                                   const ComputationDataHandle& operand,
-                                   const Shape& expected_shape) const {
-    std::unique_ptr<Shape> actual_shape =
-        b->GetShape(operand).ConsumeValueOrDie();
-    CHECK(ShapeUtil::Equal(expected_shape, *actual_shape))
+  XlaOp CheckShape(XlaBuilder* b, const XlaOp& operand,
+                   const Shape& expected_shape) const {
+    Shape actual_shape = b->GetShape(operand).ConsumeValueOrDie();
+    CHECK(ShapeUtil::Equal(expected_shape, actual_shape))
         << "want " << ShapeUtil::HumanString(expected_shape) << " got "
-        << ShapeUtil::HumanString(*actual_shape);
+        << ShapeUtil::HumanString(actual_shape);
     return operand;
   }
 
@@ -102,7 +100,7 @@ INSTANTIATE_TEST_CASE_P(BatchNormalizationTestInstance, BatchNormalizationTest,
 #endif
 
 XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
-  ComputationBuilder builder(client_, "subtract_in_z_one_sample");
+  XlaBuilder builder("subtract_in_z_one_sample");
   auto x = builder.ConstantLiteral(input_literal_);
   auto y = builder.ConstantR1<float>({3.14, 4.25});
   builder.Sub(x, y, /*broadcast_dimensions=*/{1});
@@ -118,7 +116,7 @@ XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
-  ComputationBuilder builder(client_, "square_tesseract_elementwise");
+  XlaBuilder builder("square_tesseract_elementwise");
   auto x = builder.ConstantLiteral(input_literal_);
   builder.SquareF32(x);
 
@@ -135,9 +133,9 @@ XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SumToZ) {
-  ComputationBuilder builder(client_, "sum_to_z");
+  XlaBuilder builder("sum_to_z");
   auto input_activations = builder.ConstantLiteral(input_literal_);
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all but the Z dimension.
   builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f), add,
                  {0, 2, 3});
@@ -147,24 +145,23 @@ XLA_TEST_P(BatchNormalizationTest, SumToZ) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SquareAndReduce) {
-  ComputationBuilder builder(client_, "square_and_reduce");
+  XlaBuilder builder("square_and_reduce");
   auto input_activations = builder.ConstantLiteral(input_literal_);
   auto set_means = builder.ConstantR1<float>({2.f, 4.2f});
   auto activation_deviations = builder.Sub(input_activations, set_means,
                                            /*broadcast_dimensions=*/{1});
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   auto dev_squares = builder.SquareF32(activation_deviations);
-  auto sum_of_squares = builder.Reduce(
-      dev_squares, builder.ConstantR0<float>(0.0f), add, {0, 2, 3});
+  builder.Reduce(dev_squares, builder.ConstantR0<float>(0.0f), add, {0, 2, 3});
 
   std::vector<float> expected = {18, 0.06};
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
-  ComputationBuilder builder(client_, "variance_to_stddev");
+  XlaBuilder builder("variance_to_stddev");
   auto variance = builder.ConstantR1<float>({6.f, .02f});
-  auto sqrt = builder.SqrtF32(variance);
+  builder.SqrtF32(variance);
 
   std::vector<float> expected = {2.44948974f, 0.14142136f};
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
@@ -173,13 +170,13 @@ XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
 // Compare against a forward batch normalization example in the NN spec
 // reference.
 XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
-  ComputationBuilder builder(client_, "batch_normalize_per_spec");
+  XlaBuilder builder("batch_normalize_per_spec");
   auto input_activations =
       CheckShape(&builder, builder.ConstantLiteral(input_literal_),
                  ShapeUtil::MakeShape(F32, {3, 2, 1, 1}));
   auto gamma = builder.ConstantR1<float>({1.0, 1.0});
   auto beta = builder.ConstantR1<float>({0.0, 0.0});
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all dimensions except dimension 1.
   Shape TwoElementVectorF32 = ShapeUtil::MakeShape(F32, {2});
   auto sum = CheckShape(
@@ -189,8 +186,8 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
       TwoElementVectorF32);
   auto input_shape = builder.GetShape(input_activations).ConsumeValueOrDie();
   auto sum_shape = builder.GetShape(sum).ConsumeValueOrDie();
-  auto count = builder.ConstantR0<float>(ShapeUtil::ElementsIn(*input_shape) /
-                                         ShapeUtil::ElementsIn(*sum_shape));
+  auto count = builder.ConstantR0<float>(ShapeUtil::ElementsIn(input_shape) /
+                                         ShapeUtil::ElementsIn(sum_shape));
   auto set_means = builder.Div(sum, count);
 
   const float kEpsilon = 1e-9f;
@@ -233,7 +230,7 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
 
 XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
   const int kFeatureIndex = 3;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<float>(
       {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}});
@@ -242,8 +239,8 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
 
   auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
 
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
+  builder.BatchNormTraining(operand, scale, offset,
+                            /*epsilon=*/0.001, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<float>({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}},
@@ -257,7 +254,7 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
 
 XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<float>(
       {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
@@ -266,8 +263,8 @@ XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
 
   auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
 
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
+  builder.BatchNormTraining(operand, scale, offset,
+                            /*epsilon=*/0.001, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<float>({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}},
@@ -282,23 +279,23 @@ XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
 XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
   // Use 0 dimension as feature, tests layout analyzer.
   const int kFeatureIndex = 0;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto operand = CreateR3Parameter<float>(Array3D<float>(260, 2, 2, 1.0f),
                                           /*parameter_number=*/0, "operand",
                                           &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto scale =
       CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
                                /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
+  XlaOp h2;
   auto offset =
       CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
                                /*parameter_number=*/2, "offset", &builder, &h2);
 
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/1, kFeatureIndex);
+  builder.BatchNormTraining(h0, h1, h2,
+                            /*epsilon=*/1, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR3FromArray3D<float>(Array3D<float>(260, 2, 2, 1.0f))
@@ -314,24 +311,24 @@ XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
 XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
   // Test the correctness of choosing a large epsilon value.
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto operand = CreateR3Parameter<float>({{{0.0f}, {10.0f}, {20.0f}, {30.0f}}},
                                           /*parameter_number=*/0, "operand",
                                           &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto scale =
       CreateR1Parameter<float>(std::vector<float>(1, 1.0f),
                                /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
+  XlaOp h2;
   auto offset =
       CreateR1Parameter<float>(std::vector<float>(1, 0.0f),
                                /*parameter_number=*/2, "offset", &builder, &h2);
 
   // var = 125, mean = 15, epsilon = -100
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/-100, kFeatureIndex);
+  builder.BatchNormTraining(h0, h1, h2,
+                            /*epsilon=*/-100, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR3FromArray3D<float>({{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}})
@@ -346,7 +343,7 @@ XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
 
 XLA_TEST_P(BatchNormalizationTest, BatchNormGradBasic) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand =
       builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 2, 2, 1, 0.0f));
@@ -453,7 +450,7 @@ INSTANTIATE_TEST_CASE_P(BatchNormTest_Instantiation, BatchNormTestManySizes,
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -553,7 +550,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -661,7 +658,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -828,9 +825,9 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   std::unique_ptr<GlobalData> grad_output_data =
       client_->TransferToServer(*grad_output_literal).ConsumeValueOrDie();
 
-  auto t = builder.BatchNormGrad(input_parameter, scale_parameter,
-                                 mean_parameter, var_parameter,
-                                 grad_output_parameter, epsilon, feature_index);
+  builder.BatchNormGrad(input_parameter, scale_parameter, mean_parameter,
+                        var_parameter, grad_output_parameter, epsilon,
+                        feature_index);
 
   auto expected =
       Literal::MakeTuple({expected_grad_activation.get(),
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
index 0d94d65c1015fb54ada3fdfc95d0c31d0a0f158b..777ac167a3c38c38791e12541a5db3078c37595b 100644
--- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -42,7 +42,7 @@ class BitcastConvertTest : public ClientLibraryTestBase {
 };
 
 TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({42, 64});
   builder.BitcastConvertType(a, S32);
 
@@ -51,7 +51,7 @@ TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0f, 64.0f});
   builder.BitcastConvertType(a, F32);
 
@@ -60,7 +60,7 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
 }
 
 TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR1<int32>({0, static_cast<int32>(0x80000000), 0x3F800000,
                                  static_cast<int32>(0xBF800000), 0x3F000000,
@@ -72,7 +72,7 @@ TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
 }
 
 XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   builder.BitcastConvertType(a, F32);
 
@@ -81,7 +81,7 @@ XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.6, 64.4});
   builder.BitcastConvertType(a, S32);
 
@@ -90,7 +90,7 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertS32Extremes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>(
       {std::numeric_limits<int32>::min(), std::numeric_limits<int32>::max()});
   builder.BitcastConvertType(a, F32);
@@ -100,7 +100,7 @@ TEST_F(BitcastConvertTest, ConvertS32Extremes) {
 }
 
 TEST_F(BitcastConvertTest, ConvertMapToS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in");
   b->BitcastConvertType(param, S32);
@@ -112,7 +112,7 @@ TEST_F(BitcastConvertTest, ConvertMapToS32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertMapToF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in");
   b->BitcastConvertType(param, F32);
@@ -129,7 +129,7 @@ TEST_F(BitcastConvertTest, ConvertMapToF32) {
 //   input -> convert -> reshape
 // the new convert should have the same element type as the old convert.
 TEST_F(BitcastConvertTest, ConvertReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR1<int32>({0x42280000});
   auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{});
   builder.BitcastConvertType(reshape, F32);
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 17c6a83c1a3153f78da7f5f6c9b76542bc564203..312d8f284d3421b4ef06b94c12949fc5fe4fa0b0 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -35,6 +36,10 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 namespace {
+
+// Name of the interpreter backend.
+constexpr char kInterpreter[] = "interpreter";
+
 // Wrapper function that creates a nicer error message (than a bare
 // ValueOrDie()) if the platform we intend to test is not available.
 Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
@@ -43,6 +48,14 @@ Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
   TF_CHECK_OK(result.status()) << " could not create local client for testing";
   return result.ValueOrDie();
 }
+
+// Helper functions to get the reference platform.
+se::Platform* GetReferencePlatform() {
+  auto result = PlatformUtil::GetPlatform(kInterpreter);
+  TF_CHECK_OK(result.status()) << "could not get interpreter platform";
+  return result.ValueOrDie();
+}
+
 }  // namespace
 
 ClientLibraryTestBase::ClientLibraryTestBase(
@@ -66,6 +79,11 @@ ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
   LocalClientOptions default_options;
   default_options.set_platform(platform);
   client_ = GetOrCreateLocalClientOrDie(default_options);
+
+  LocalClientOptions ref_options;
+  ref_options.set_platform(GetReferencePlatform());
+  ref_client_ = GetOrCreateLocalClientOrDie(ref_options);
+
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
       "constant_folding");
 }
@@ -74,9 +92,9 @@ string ClientLibraryTestBase::TestName() const {
   return ::testing::UnitTest::GetInstance()->current_test_info()->name();
 }
 
+template <typename BuilderT>
 StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
-    ComputationBuilder* builder,
-    tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+    BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   // Build the computation, as a convenience.
   TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
   return client_->Execute(computation, arguments, &execution_options_);
@@ -127,6 +145,20 @@ StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
   return ExecuteAndTransfer(computation, arguments, shape_with_output_layout);
 }
 
+StatusOr<std::unique_ptr<Literal>>
+ClientLibraryTestBase::ExecuteAndTransferReference(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  ExecutionOptions execution_options = execution_options_;
+  if (shape_with_output_layout != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *shape_with_output_layout;
+  }
+  return ref_client_->ExecuteAndTransfer(computation, arguments,
+                                         &execution_options);
+}
+
 std::unique_ptr<GlobalData> ClientLibraryTestBase::ExecuteOrDie(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
@@ -521,6 +553,69 @@ ClientLibraryTestBase::ComputeValueAndReference(
   return std::make_pair(std::move(reference), std::move(result));
 }
 
+void ClientLibraryTestBase::ComputeAndCompare(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments) {
+  auto status_or_data = ComputeValueAndReference(builder, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectEqual(*reference, *result);
+}
+
+void ClientLibraryTestBase::ComputeAndCompare(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments,
+    ErrorSpec error) {
+  auto status_or_data = ComputeValueAndReference(builder, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectNear(*reference, *result, error);
+}
+
+StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+ClientLibraryTestBase::ComputeValueAndReference(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments) {
+  // Transfer the arguments to the executor service. We put the unique_ptr's
+  // into a vector to keep the data alive on the service until the end of this
+  // function.
+  std::vector<std::unique_ptr<GlobalData>> argument_data;
+  std::vector<std::unique_ptr<GlobalData>> ref_argument_data;
+  for (const auto& arg : arguments) {
+    TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg.Clone()));
+    TF_ASSIGN_OR_RETURN(auto ref_data, ref_client_->TransferToServer(arg));
+    argument_data.push_back(std::move(data));
+    ref_argument_data.push_back(std::move(ref_data));
+  }
+
+  // Create raw pointers to the GlobalData for the rest of the call stack.
+  std::vector<GlobalData*> argument_data_ptr;
+  std::transform(
+      argument_data.begin(), argument_data.end(),
+      std::back_inserter(argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+  std::vector<GlobalData*> ref_argument_data_ptr;
+  std::transform(
+      ref_argument_data.begin(), ref_argument_data.end(),
+      std::back_inserter(ref_argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+
+  TF_ASSIGN_OR_RETURN(auto result,
+                      ExecuteAndTransfer(computation, argument_data_ptr));
+
+  TF_ASSIGN_OR_RETURN(auto reference, ExecuteAndTransferReference(
+                                          computation, ref_argument_data_ptr));
+
+  return std::make_pair(std::move(reference), std::move(result));
+}
+
 Computation ClientLibraryTestBase::CreateScalarRelu() {
   ComputationBuilder builder(client_, "relu");
   auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
@@ -595,6 +690,14 @@ ComputationDataHandle ClientLibraryTestBase::AddParam(
   return data_handle;
 }
 
+XlaOp ClientLibraryTestBase::AddParam(const Literal& argument,
+                                      XlaBuilder* builder) {
+  XlaOp data_handle;
+  arguments_.push_back(CreateParameterAndTransferLiteral(
+      arguments_.size(), argument, "", builder, &data_handle));
+  return data_handle;
+}
+
 ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
     const Literal& literal, ComputationBuilder* builder) {
   return builder->ConstantLiteral(
@@ -643,4 +746,11 @@ template void ClientLibraryTestBase::ComputeAndCompareTuple(
     XlaBuilder* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
 
+template StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
+    ComputationBuilder* builder,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 52f31b06698a424929df0ea1425ca66b5ac96a18..b3212dd2282375367ce890e960278fc469a5ef52 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -92,9 +92,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   // Convenience methods for building and running a computation with the member
   // execution options. Modify execution_options_ in your test if you want to
   // customize the options.
+  template <typename BuilderT>
   StatusOr<std::unique_ptr<GlobalData>> Execute(
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+      BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   // TODO(b/74197823): Remove the template type 'BuilderT' in all methods once
   // the migration to XlaBuilder is complete.
@@ -114,6 +114,14 @@ class ClientLibraryTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_output_layout = nullptr);
 
+  // This executes the computation via the reference client (which connects a
+  // interpreter backend). The result is used as the expected values of the
+  // computation.
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransferReference(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const Shape* shape_with_output_layout = nullptr);
+
   // Convenience OrDie variants of above methods.
   std::unique_ptr<GlobalData> ExecuteOrDie(
       ComputationBuilder* builder,
@@ -236,6 +244,14 @@ class ClientLibraryTestBase : public ::testing::Test {
                          tensorflow::gtl::ArraySlice<Literal> arguments,
                          ErrorSpec error);
 
+  // Convenience method for running a built computation and comparing the result
+  // with the reference result.
+  void ComputeAndCompare(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<Literal> arguments);
+  void ComputeAndCompare(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<Literal> arguments,
+                         ErrorSpec error);
+
   // Create scalar operations for use in reductions.
   Computation CreateScalarRelu();
   Computation CreateScalarMax();
@@ -300,12 +316,17 @@ class ClientLibraryTestBase : public ::testing::Test {
   // set exactly once. The first added parameter gets index 0, then 1 and so on.
   ComputationDataHandle AddParam(const Literal& argument,
                                  ComputationBuilder* builder);
+  XlaOp AddParam(const Literal& argument, XlaBuilder* builder);
 
   template <class T>
   ComputationDataHandle AddParam(const Array<T>& argument,
                                  ComputationBuilder* builder) {
     return AddParam(*Literal::CreateFromArray(argument), builder);
   }
+  template <class T>
+  XlaOp AddParam(const Array<T>& argument, XlaBuilder* builder) {
+    return AddParam(*Literal::CreateFromArray(argument), builder);
+  }
 
   // Creates a constant instruction with the given literal. When the
   // use_bfloat16 flag is set but the literal has F32 elements, the elements
@@ -408,6 +429,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   PrimitiveType FloatType() const { return use_bfloat16_ ? BF16 : F32; }
 
   Client* client_;
+  Client* ref_client_;  // To compute reference result.
   ExecutionOptions execution_options_;
 
  private:
@@ -439,12 +461,19 @@ class ClientLibraryTestBase : public ::testing::Test {
       const Shape* output_with_layout = nullptr);
 
   // Executes the computation and calculates the expected reference value using
-  // the HloEvaluator. Returns two literal in the order of (expected, actual).
+  // the HloEvaluator. Returns two literals in the order of (expected, actual).
   StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
   ComputeValueAndReference(ComputationBuilder* builder,
                            const ComputationDataHandle& operand,
                            tensorflow::gtl::ArraySlice<Literal> arguments);
 
+  // Executes the computation and calculates the expected reference value using
+  // the reference client. Returns two literals in the order of (expected,
+  // actual).
+  StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+  ComputeValueAndReference(XlaBuilder* builder,
+                           tensorflow::gtl::ArraySlice<Literal> arguments);
+
   // Whether to run tests with all float-type input/output converted to
   // bfloat16.
   bool use_bfloat16_ = false;
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index e5a03b49ad259a64b9cbbc88c31d8c6558289d1b..c15d808f1ddfb44a512fa395bb8e515bca3859b6 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -31,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -71,28 +74,35 @@ class ComputeConstantTest : public ::testing::Test {
   }
 
   StatusOr<std::unique_ptr<Literal>> ComputeConstantLiteral(
-      Client* client, const ComputationDataHandle& operand,
-      ComputationBuilder* builder, Layout* output_layout = nullptr,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
-    TF_ASSIGN_OR_RETURN(auto computed, builder->ComputeConstant(
-                                           operand, output_layout, parameters));
+      Client* client, const XlaOp& operand, XlaBuilder* builder,
+      Layout* output_layout = nullptr) {
+    TF_ASSIGN_OR_RETURN(auto subgraph, builder->BuildConstantSubGraph(operand));
+    TF_ASSIGN_OR_RETURN(auto computed,
+                        client->ComputeConstant(subgraph, output_layout));
     return std::move(computed);
   }
 
+  template <class Scalar>
+  StatusOr<Scalar> ComputeConstantScalar(Client* client, const XlaOp& operand,
+                                         XlaBuilder* builder) {
+    TF_ASSIGN_OR_RETURN(auto literal, ComputeConstantLiteral(client, operand,
+                                                             builder, nullptr));
+    return literal->Get<Scalar>({});
+  }
+
   template <class Scalar>
   StatusOr<Scalar> ComputeConstantScalar(
       Client* client, const ComputationDataHandle& operand,
       ComputationBuilder* builder,
       tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
-    TF_ASSIGN_OR_RETURN(
-        auto literal,
-        ComputeConstantLiteral(client, operand, builder, nullptr, parameters));
+    TF_ASSIGN_OR_RETURN(auto literal,
+                        builder->ComputeConstant(
+                            operand, /*output_layout=*/nullptr, parameters));
     return literal->Get<Scalar>({});
   }
 
-  bool IsConstant(const ComputationDataHandle& operand,
-                  ComputationBuilder* builder, int64 num_parameters = 0) {
-    StatusOr<bool> result = builder->IsConstant(operand, num_parameters);
+  bool IsConstant(const XlaOp& operand, XlaBuilder* builder) {
+    StatusOr<bool> result = builder->IsConstant(operand);
     EXPECT_TRUE(result.ok()) << result.status();
     return result.ok() ? result.ValueOrDie() : false;
   }
@@ -103,7 +113,7 @@ class ComputeConstantTest : public ::testing::Test {
 TEST_F(ComputeConstantTest, ScalarInt32Literal) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.ConstantR0<int32>(42);
     EXPECT_TRUE(IsConstant(computation, &b));
 
@@ -116,7 +126,7 @@ TEST_F(ComputeConstantTest, ScalarInt32Literal) {
 TEST_F(ComputeConstantTest, ScalarFloatAdd) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.Add(b.ConstantR0<float>(42.5f), b.ConstantR0<float>(1.5f));
     EXPECT_TRUE(IsConstant(computation, &b));
@@ -130,7 +140,7 @@ TEST_F(ComputeConstantTest, ScalarFloatAdd) {
 TEST_F(ComputeConstantTest, ScalarRng) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.RngUniform(b.ConstantR0<float>(1.1f), b.ConstantR0<float>(2.1f),
                      ShapeUtil::MakeShape(F32, {}));
@@ -151,19 +161,21 @@ TEST_F(ComputeConstantTest, Param) {
 
     std::vector<Literal> arguments;
     arguments.push_back(std::move(*Literal::CreateR0(42.5f)));
-    EXPECT_TRUE(IsConstant(computation, &b, arguments.size()));
-
-    auto value =
-        ComputeConstantScalar<float>(client, computation, &b, arguments);
-    ASSERT_TRUE(value.ok()) << value.status();
-    EXPECT_EQ(value.ValueOrDie(), 44.0f);
+    TF_ASSERT_OK_AND_ASSIGN(bool is_constant,
+                            b.IsConstant(computation, arguments.size()));
+    EXPECT_TRUE(is_constant);
+
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto value,
+        ComputeConstantScalar<float>(client, computation, &b, arguments));
+    EXPECT_EQ(value, 44.0f);
   }
 }
 
 TEST_F(ComputeConstantTest, DirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param");
     EXPECT_FALSE(IsConstant(computation, &b));
 
@@ -177,7 +189,7 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
 TEST_F(ComputeConstantTest, IndirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.Add(b.ConstantR0<float>(1.0f),
               b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
@@ -195,7 +207,7 @@ TEST_F(ComputeConstantTest, IndirectParamMissing) {
 TEST_F(ComputeConstantTest, UnrelatedParam) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0");
     auto constant_4 =
@@ -212,64 +224,64 @@ TEST_F(ComputeConstantTest, UnrelatedParam) {
 
     EXPECT_TRUE(IsConstant(constant_13, &b));
 
-    auto value = ComputeConstantScalar<float>(client, constant_13, &b);
-    ASSERT_TRUE(value.ok()) << value.status();
-    EXPECT_EQ(value.ValueOrDie(), 13.0f);
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto value, ComputeConstantScalar<float>(client, constant_13, &b));
+    EXPECT_EQ(value, 13.0f);
   }
 }
 
 TEST_F(ComputeConstantTest, NonScalarAdd) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     auto computation =
         b.Add(b.ConstantR1<int32>({1, 2}), b.ConstantR1<int32>({3, 4}));
     EXPECT_TRUE(IsConstant(computation, &b));
 
-    auto computed = ComputeConstantLiteral(client, computation, &b);
-    ASSERT_TRUE(computed.ok()) << computed.status();
+    TF_ASSERT_OK_AND_ASSIGN(auto computed,
+                            ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal =
         Literal::CreateR1<int32>({4, 6});
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
   }
 }
 
 TEST_F(ComputeConstantTest, IntegerDivide) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.Div(b.ConstantR0<int32>(15), b.ConstantR0<int32>(3));
     EXPECT_TRUE(IsConstant(computation, &b));
 
-    auto computed = ComputeConstantLiteral(client, computation, &b);
-    ASSERT_TRUE(computed.ok()) << computed.status();
+    TF_ASSERT_OK_AND_ASSIGN(auto computed,
+                            ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal = Literal::CreateR0<int32>(5);
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
   }
 }
 
 XLA_TEST_F(ComputeConstantTest, Layout) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
     for (const std::vector<int64>& layout : layouts) {
       auto layout_proto = LayoutUtil::MakeLayout(layout);
-      auto computed = ComputeConstantLiteral(
-          client,
-          b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
-                b.ConstantR2<int32>({{10, 20}, {30, 40}})),
-          &b, &layout_proto);
-      ASSERT_TRUE(computed.ok()) << computed.status();
+      TF_ASSERT_OK_AND_ASSIGN(
+          auto computed, ComputeConstantLiteral(
+                             client,
+                             b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
+                                   b.ConstantR2<int32>({{10, 20}, {30, 40}})),
+                             &b, &layout_proto));
 
       std::unique_ptr<Literal> expected_literal =
           Literal::CreateR2WithLayout<int32>({{11, 22}, {33, 44}},
                                              LayoutUtil::MakeLayout(layout));
-      LiteralTestUtil::AssertEqualShapesAndLayouts(
-          expected_literal->shape(), computed.ValueOrDie()->shape());
-      LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+      LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(),
+                                                   computed->shape());
+      LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
     }
   }
 }
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index b917dee77b5400db8f2c0a6a86258fee64723d71..7ff6706935740c7d76ee5cd03eae292386760397 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -23,8 +24,8 @@ namespace {
 
 class ConditionalOpTest : public ClientLibraryTestBase {
  protected:
-  Computation CreateR0ConstantComputation(float value) {
-    ComputationBuilder builder(client_, "Constant");
+  XlaComputation CreateR0ConstantComputation(float value) {
+    XlaBuilder builder("Constant");
     builder.Parameter(0, empty_tuple_, "tuple");
     builder.ConstantR0<float>(value);
     auto build_status = builder.Build();
@@ -32,16 +33,16 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0IdentityComputation() {
-    ComputationBuilder builder(client_, "Identity");
+  XlaComputation CreateR0IdentityComputation() {
+    XlaBuilder builder("Identity");
     builder.Parameter(0, r0f32_, "x");
     auto build_status = builder.Build();
     EXPECT_IS_OK(build_status.status());
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateCeilComputation(const Shape& shape) {
-    ComputationBuilder builder(client_, "Ceil");
+  XlaComputation CreateCeilComputation(const Shape& shape) {
+    XlaBuilder builder("Ceil");
     auto param = builder.Parameter(0, shape, "param");
     builder.Ceil(param);
     auto build_status = builder.Build();
@@ -49,16 +50,16 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0CeilComputation() {
+  XlaComputation CreateR0CeilComputation() {
     return CreateCeilComputation(r0f32_);
   }
 
-  Computation CreateR1CeilComputation() {
+  XlaComputation CreateR1CeilComputation() {
     return CreateCeilComputation(r1s2f32_);
   }
 
-  Computation CreateFloorComputation(const Shape& shape) {
-    ComputationBuilder builder(client_, "Floor");
+  XlaComputation CreateFloorComputation(const Shape& shape) {
+    XlaBuilder builder("Floor");
     auto param = builder.Parameter(0, shape, "param");
     builder.Floor(param);
     auto build_status = builder.Build();
@@ -66,17 +67,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0FloorComputation() {
+  XlaComputation CreateR0FloorComputation() {
     return CreateFloorComputation(r0f32_);
   }
 
-  Computation CreateR1FloorComputation() {
+  XlaComputation CreateR1FloorComputation() {
     return CreateFloorComputation(r1s2f32_);
   }
 
-  Computation CreateTupleCeilComputation(const string& computation_name,
-                                         const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleCeilComputation(const string& computation_name,
+                                            const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -88,17 +89,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleCeilComputation() {
+  XlaComputation CreateR0TupleCeilComputation() {
     return CreateTupleCeilComputation("CeilR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleCeilComputation() {
+  XlaComputation CreateR1TupleCeilComputation() {
     return CreateTupleCeilComputation("CeilR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleFloorComputation(const string& computation_name,
-                                          const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleFloorComputation(const string& computation_name,
+                                             const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -110,17 +111,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleFloorComputation() {
+  XlaComputation CreateR0TupleFloorComputation() {
     return CreateTupleFloorComputation("FloorR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleFloorComputation() {
+  XlaComputation CreateR1TupleFloorComputation() {
     return CreateTupleFloorComputation("FloorR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleAddComputation(const string& computation_name,
-                                        const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleAddComputation(const string& computation_name,
+                                           const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -130,17 +131,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleAddComputation() {
+  XlaComputation CreateR0TupleAddComputation() {
     return CreateTupleAddComputation("AddR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleAddComputation() {
+  XlaComputation CreateR1TupleAddComputation() {
     return CreateTupleAddComputation("AddR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleSubComputation(const string& computation_name,
-                                        const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleSubComputation(const string& computation_name,
+                                           const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -150,11 +151,11 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleSubComputation() {
+  XlaComputation CreateR0TupleSubComputation() {
     return CreateTupleSubComputation("SubR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleSubComputation() {
+  XlaComputation CreateR1TupleSubComputation() {
     return CreateTupleSubComputation("SubR1", tuple_2_r1s2f32_);
   }
 
@@ -170,26 +171,25 @@ class ConditionalOpTest : public ClientLibraryTestBase {
 
 // Test true and false computations that do not take any parameters.
 XLA_TEST_F(ConditionalOpTest, Parameters0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({});
   auto true_computation = CreateR0ConstantComputation(56.0f);
   auto false_computation = CreateR0ConstantComputation(12.0f);
-  auto result = builder.Conditional(pred, operands, true_computation, operands,
-                                    false_computation);
+  builder.Conditional(pred, operands, true_computation, operands,
+                      false_computation);
 
   ComputeAndCompareR0<float>(&builder, 56.0f, {}, error_spec_);
 }
 
 // Test true and false computations that take in 1 parameter.
 XLA_TEST_F(ConditionalOpTest, Parameters1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto identity = CreateR0IdentityComputation();
-  auto result =
-      builder.Conditional(pred, operand1, identity, operand2, identity);
+  builder.Conditional(pred, operand1, identity, operand2, identity);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -197,12 +197,12 @@ XLA_TEST_F(ConditionalOpTest, Parameters1) {
 // Test conditional with two different computations in the true and false cases
 // that take in different arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -210,11 +210,11 @@ XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
 // Test conditional with two different computations in the true and false cases
 // that take in the same arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand, CreateR0CeilComputation(),
-                                    operand, CreateR0FloorComputation());
+  builder.Conditional(pred, operand, CreateR0CeilComputation(), operand,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -222,12 +222,12 @@ XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
 // Test conditional with the same computation in the true and false cases but
 // take in different arguments.
 XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
   auto floor = CreateR0FloorComputation();
-  auto result = builder.Conditional(pred, operand1, floor, operand2, floor);
+  builder.Conditional(pred, operand1, floor, operand2, floor);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -235,11 +235,11 @@ XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
 // Test conditional with the same computation in the true and false cases that
 // take in the same arguments.
 XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand = builder.ConstantR0<float>(12.6f);
   auto floor = CreateR0FloorComputation();
-  auto result = builder.Conditional(pred, operand, floor, operand, floor);
+  builder.Conditional(pred, operand, floor, operand, floor);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -247,12 +247,12 @@ XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
 // Test conditional with different instances of the same computation in the true
 // and false cases.
 XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand1, CreateR0FloorComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0FloorComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -260,7 +260,7 @@ XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
 // Test the case when a call invokes a computation that contains a conditional.
 XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   auto pred_cond = inner_builder.Parameter(0, r0bool, "param0");
   auto true_operand = inner_builder.Parameter(1, r0f32_, "param1");
   auto false_operand = inner_builder.Parameter(2, r0f32_, "param2");
@@ -268,7 +268,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
                             false_operand, CreateR0FloorComputation());
   auto inner_builder_result = inner_builder.Build();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
@@ -281,14 +281,13 @@ XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
 // Test true and false computations that take in 2 parameters and predicate is
 // true.
 XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
-                          operands, CreateR0TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
 
   ComputeAndCompareR0<float>(&builder, 68.0f, {}, error_spec_);
 }
@@ -296,14 +295,13 @@ XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
 // Test true and false computations that take in 2 parameters and predicate is
 // false.
 XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
-                          operands, CreateR0TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
 
   ComputeAndCompareR0<float>(&builder, 44.0f, {}, error_spec_);
 }
@@ -311,14 +309,13 @@ XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
 // Test true and false computations that take in 2 array parameters and
 // predicate is true.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
   auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
-                          operands, CreateR1TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR1TupleSubComputation());
 
   ComputeAndCompareR1<float>(&builder, {34.0f, 67.0f}, {}, error_spec_);
 }
@@ -326,21 +323,20 @@ XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
 // Test true and false computations that take in 2 array parameters and
 // predicate is false.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
   auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
-                          operands, CreateR1TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR1TupleSubComputation());
 
   ComputeAndCompareR1<float>(&builder, {14.0f, 45.0f}, {}, error_spec_);
 }
 
 // Test true and false computations that return a tuple of scalars.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operands = builder.Tuple(
       {builder.ConstantR0<float>(12.2f), builder.ConstantR0<float>(25.6f)});
@@ -356,7 +352,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
 
 // Test true and false computations that return a tuple of arrays.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({builder.ConstantR1<float>({12.2f, 15.8f}),
                                  builder.ConstantR1<float>({25.6f, 29.2f})});
@@ -373,7 +369,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
 // Test true and false computations that return a tuple of a predicate, a
 // scalar, and an array.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
-  ComputationBuilder true_builder(client_, TestName() + ".true");
+  XlaBuilder true_builder(TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
     auto true_pred = true_builder.ConstantR0<bool>(true);
@@ -384,7 +380,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   auto true_builder_result = true_builder.Build();
   EXPECT_IS_OK(true_builder_result.status());
 
-  ComputationBuilder false_builder(client_, TestName() + ".false");
+  XlaBuilder false_builder(TestName() + ".false");
   {
     false_builder.Parameter(0, empty_tuple_, "tuple");
     auto false_pred = false_builder.ConstantR0<bool>(false);
@@ -395,7 +391,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   auto false_builder_result = false_builder.Build();
   EXPECT_IS_OK(false_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({});
   builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
@@ -411,7 +407,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
 
 // Test true and false computations that return a nested tuple.
 XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
-  ComputationBuilder true_builder(client_, TestName() + ".true");
+  XlaBuilder true_builder(TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
     auto true_constant1 = true_builder.ConstantR0<float>(12.2f);
@@ -424,7 +420,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   auto true_builder_result = true_builder.Build();
   EXPECT_IS_OK(true_builder_result.status());
 
-  ComputationBuilder false_builder(client_, TestName() + ".false");
+  XlaBuilder false_builder(TestName() + ".false");
   {
     false_builder.Parameter(0, empty_tuple_, "tuple");
     auto false_constant1 = false_builder.ConstantR0<float>(46.6f);
@@ -438,7 +434,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   auto false_builder_result = false_builder.Build();
   EXPECT_IS_OK(false_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operands = builder.Tuple({});
   builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
@@ -460,16 +456,16 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
 // params.
 XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle pred, operand1, operand2;
+  XlaOp pred, operand1, operand2;
   auto pred_arg = CreateR0Parameter<bool>(true, 0, "pred", &builder, &pred);
   auto operand1_param =
       CreateR0Parameter<float>(56.3f, 1, "operand1", &builder, &operand1);
   auto operand2_param =
       CreateR0Parameter<float>(12.7f, 2, "operand2", &builder, &operand2);
-  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(
       &builder, 57.0f,
@@ -480,16 +476,16 @@ XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
 // Test conditional that takes in array operands in the form of external params.
 XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle pred, operand1, operand2;
+  XlaOp pred, operand1, operand2;
   auto pred_arg = CreateR0Parameter<bool>(false, 0, "pred", &builder, &pred);
   auto operand1_param = CreateR1Parameter<float>({24.3f, 56.7f}, 1, "operand1",
                                                  &builder, &operand1);
   auto operand2_param = CreateR1Parameter<float>({10.2f, 11.6f}, 2, "operand2",
                                                  &builder, &operand2);
-  auto result = builder.Conditional(pred, operand1, CreateR1CeilComputation(),
-                                    operand2, CreateR1FloorComputation());
+  builder.Conditional(pred, operand1, CreateR1CeilComputation(), operand2,
+                      CreateR1FloorComputation());
 
   ComputeAndCompareR1<float>(
       &builder, {10.0f, 11.0f},
@@ -499,7 +495,7 @@ XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
 
 // Test the case where one conditional is nested within another.
 XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   {
     Shape r0bool = ShapeUtil::MakeShape(PRED, {});
     Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
@@ -514,7 +510,7 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
   auto inner_builder_result = inner_builder.Build();
   EXPECT_IS_OK(inner_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred1 = builder.ConstantR0<bool>(true);
   auto pred2 = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(1.1f);
@@ -529,7 +525,7 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
 }
 
 XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   {
     Shape r0bool = ShapeUtil::MakeShape(PRED, {});
     Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
@@ -544,7 +540,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
   auto inner_builder_result = inner_builder.Build();
   EXPECT_IS_OK(inner_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred2 = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(1.1f);
   auto operand2 = builder.ConstantR0<float>(12.2f);
@@ -556,7 +552,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
 
 // Test a mismatch in the shape of the true operand and true computation.
 XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
@@ -573,27 +569,27 @@ XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
 
 XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
   Shape tuple_shape = ShapeUtil::MakeTupleShape({r0f32_, r0f32_});
-  Computation swapper;
+  XlaComputation swapper;
   {
-    ComputationBuilder builder(client_, TestName() + ".swapper");
+    XlaBuilder builder(TestName() + ".swapper");
     auto param0 = builder.Parameter(0, tuple_shape, "sp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
     builder.Tuple({y, x});
     swapper = builder.Build().ConsumeValueOrDie();
   }
-  Computation forwarder;
+  XlaComputation forwarder;
   {
-    ComputationBuilder builder(client_, TestName() + ".forwarder");
+    XlaBuilder builder(TestName() + ".forwarder");
     auto param0 = builder.Parameter(0, tuple_shape, "fp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
     builder.Tuple({x, y});
     forwarder = builder.Build().ConsumeValueOrDie();
   }
-  Computation main;
+  XlaComputation main;
   {
-    ComputationBuilder builder(client_, TestName() + ".main");
+    XlaBuilder builder(TestName() + ".main");
     auto param0 = builder.Parameter(0, tuple_shape, "mp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
@@ -605,7 +601,7 @@ XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
   }
 
   auto test_swap = [&](float a, float b) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto x = builder.ConstantR0<float>(a);
     auto y = builder.ConstantR0<float>(b);
     auto tuple_operand = builder.Tuple({x, y});
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 72715398dea468d0000144759454c5f8d8673516..947959beb144e1509a77ad2f94b8493de46ba6f2 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -88,12 +88,12 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
     ASSERT_EQ(2, arhs->width());
     ASSERT_EQ(2, arhs->height());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto lhs = builder.ConstantR4FromArray4D<T>(*alhs);
     auto rhs = builder.ConstantR4FromArray4D<T>(*arhs);
-    auto conv = builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
+    builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
 
-    ComputeAndCompare(&builder, conv, {}, error_spec_);
+    ComputeAndCompare(&builder, {}, error_spec_);
   }
 };
 
@@ -106,12 +106,12 @@ template <typename T>
 class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 1, 2});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 1, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+    builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
     Array4D<T> input_data(1, 1, 1, 2);
     input_data.FillWithYX(Array2D<T>({
@@ -122,7 +122,7 @@ class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest {
         {5.0f, 6.0f},
     }));
 
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -137,12 +137,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 2, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+    builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({
@@ -156,7 +156,7 @@ class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest {
         {5.0f, 6.0f},
         {7.0f, 8.0f},
     }));
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -171,12 +171,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 2, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+    builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({
@@ -191,7 +191,7 @@ class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest {
         {7.0f, 8.0f},
     }));
 
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -207,12 +207,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 3, 3});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+    builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({{1.0f, 2.0f, 3.0f, 4.0f},
@@ -223,7 +223,7 @@ class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest {
     filter_data.FillWithYX(Array2D<T>(
         {{5.0f, 6.0f, 7.0f}, {8.0f, 9.0f, 10.0f}, {11.0f, 12.0f, 13.0f}}));
     // clang-format on
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -234,7 +234,7 @@ TYPED_TEST_CASE(Convolve_1x1x4x4_1x1x3x3_Same, TestTypes);
 TYPED_TEST(Convolve_1x1x4x4_1x1x3x3_Same, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -264,7 +264,7 @@ template <typename T>
 class Convolve1D_1x2x5_1x2x2_WithRHSDilation : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     {
       Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 5});
       Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 2});
@@ -300,7 +300,7 @@ TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithRHSDilation, TestTypes);
 TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithRHSDilation, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -331,7 +331,7 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) {
 }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSAndRHSDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -365,7 +365,7 @@ template <typename T>
 class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     {
       Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 5});
       Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 2});
@@ -402,7 +402,7 @@ TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithPadding, TestTypes);
 TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithPadding, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::vector<int64> input_dims = {1, 4, 2, 3, 3};
   std::vector<int64> filter_dims = {2, 2, 2, 3, 3};
   Shape input_shape = ShapeUtil::MakeShape(F32, input_dims);
@@ -469,7 +469,7 @@ template <typename T>
 class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     std::vector<int64> input_dims = {1, 3, 3, 5};
     std::vector<int64> filter_dims = {3, 3, 5, 3};
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
@@ -537,7 +537,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
     execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
         "convolution-canonicalization");
   }
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(F32, {4, 29});
   Shape filter_shape = ShapeUtil::MakeShape(F32, {4, 10});
 
@@ -551,8 +551,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   dnums.set_kernel_output_feature_dimension(1);
   dnums.set_output_batch_dimension(0);
   dnums.set_output_feature_dimension(1);
-  auto conv = builder.ConvWithGeneralDimensions(input, filter, {},
-                                                Padding::kValid, dnums);
+  builder.ConvWithGeneralDimensions(input, filter, {}, Padding::kValid, dnums);
 
   Array2D<float> param0(4, 29);
   param0.FillUnique();
@@ -563,7 +562,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   Array2D<float> expected_result(29, 10);
   expected_result.Fill(0);
 
-  ComputeAndCompare(&builder, conv,
+  ComputeAndCompare(&builder,
                     {std::move(*Literal::CreateFromArray(param0)),
                      std::move(*Literal::CreateFromArray(param1))},
                     error_spec_);
@@ -587,7 +586,7 @@ class Convolve1D1WindowTestBase
  protected:
   template <typename T>
   void TestImpl() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     int64 input_feature = GetParam().input_feature;
     int64 output_feature = GetParam().output_feature;
     int64 batch = GetParam().batch;
@@ -724,12 +723,12 @@ INSTANTIATE_TEST_CASE_P(
 #endif
 
 XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
   Shape filter_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
   auto input = builder.Parameter(0, input_shape, "input");
   auto filter = builder.Parameter(1, filter_shape, "filter");
-  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+  builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
   Array4D<bfloat16> input_data(1, 1, 1, 2);
   input_data.FillWithYX(Array2D<bfloat16>({
@@ -740,11 +739,34 @@ XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
       {bfloat16(5), bfloat16(6)},
   }));
 
-  ComputeAndCompare(&builder, conv,
+  ComputeAndCompare(&builder,
                     {std::move(*Literal::CreateFromArray(input_data)),
                      std::move(*Literal::CreateFromArray(filter_data))},
                     error_spec_);
 }
 
+// Check that GPU convs still work if the CudnnAlgorithmPicker pass is disabled.
+// (We run this test on all platforms, because, what the heck.)
+XLA_TEST_F(ConvolutionTest, NoCudnnAlgorithmPicker) {
+  execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+      "cudnn-convolution-algorithm-picker");
+
+  XlaBuilder builder(TestName());
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  builder.Conv(input, filter, {1, 1}, Padding::kValid);
+
+  Array4D<float> input_data(1, 1, 1, 2);
+  input_data.FillIota(0);
+  Array4D<float> filter_data(1, 1, 1, 2);
+  filter_data.FillIota(10);
+
+  ComputeAndCompare(&builder,
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 2d847a66b0ae7c8f09fa0cb181a4c84ea99be5b1..b43d5c9ff5d75ee0e1b3c9ceb2bc295e631ac107 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -134,9 +134,9 @@ class CustomCallClientAPITest : public ClientLibraryTestBase {};
 // When using the client API, CustomCall targets can't begin with '$' -- these
 // are reserved for internal use.
 XLA_TEST_F(CustomCallClientAPITest, IllegalCustomCallTarget) {
-  ComputationBuilder builder(client_, TestName());
-  auto call = builder.CustomCall("$illegal", /*operands=*/{},
-                                 ShapeUtil::MakeShape(F32, {1}));
+  XlaBuilder builder(TestName());
+  builder.CustomCall("$illegal", /*operands=*/{},
+                     ShapeUtil::MakeShape(F32, {1}));
 
   StatusOr<std::unique_ptr<GlobalData>> result =
       Execute(&builder, /*arguments=*/{});
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 7b994a4c172cafee53ede9bfd4f30b0e0c9888d5..c4031dfee593a13af6a5db15e43ed7bc418603c5 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -50,6 +50,13 @@ using TypesF16F32 = ::testing::Types<Eigen::half, float>;
 using TypesF16F32F64 = ::testing::Types<Eigen::half, float, double>;
 using TypesF16F32F64CF64 =
     ::testing::Types<Eigen::half, float, double, complex64>;
+#elif !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_COMPLEX)
+using TypesF16F32 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64CF64 =
+    ::testing::Types<Eigen::half, float>;
 #else
 #error "Situation not handled yet"
 #endif
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index 9db68ff7a6dcbd9204fb2b3a37734a9aaed35dfd..90496d55e60b4f45fc2d46b2746f94d775cf9f94 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -405,7 +405,7 @@ class GatherClientLibraryTest : public ClientLibraryTestBase {};
 // GPU and CPU_PARALLEL.
 XLA_TEST_F(GatherClientLibraryTest,
            DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(Basic))) {
-  // We create this HLO, but using the ComputationBuilder API.
+  // We create this HLO, but using the XlaBuilder API.
   //
   // ENTRY main {
   //   operand = s32[3,3] parameter(0)
@@ -418,7 +418,7 @@ XLA_TEST_F(GatherClientLibraryTest,
   //       window_bounds={1, 3}
   // }
 
-  ComputationBuilder builder(client_, "gather_basic");
+  XlaBuilder builder("gather_basic");
 
   Shape operand_shape = ShapeUtil::MakeShape(S32, {3, 3});
   Shape indices_shape = ShapeUtil::MakeShape(S32, {2});
@@ -443,8 +443,8 @@ XLA_TEST_F(GatherClientLibraryTest,
                           client_->GetDeviceHandles(1));
   xla::ExecutionOptions execution_options = CreateDefaultExecutionOptions();
   *execution_options.add_device_handles() = devices[0];
-  TF_ASSERT_OK_AND_ASSIGN(Computation computation, builder.Build());
-  std::vector<xla::Client::ComputationInstance> computation_instances = {
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build());
+  std::vector<xla::Client::XlaComputationInstance> computation_instances = {
       {computation,
        {operand_arg.get(), indices_arg.get()},
        execution_options,
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index e574644dea7c1ba144ba87fbeb7f28cc52312e26..21f71fc91bb84540e5347811cb4643a8aeda445c 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -91,7 +91,7 @@ HloTestBase::HloTestBase()
 HloTestBase::HloTestBase(se::Platform* test_platform,
                          se::Platform* reference_platform)
     : test_runner_(test_platform), reference_runner_(reference_platform) {
-  hlo_verifier_ = MakeUnique<HloVerifier>();
+  hlo_verifier_ = MakeUnique<HloVerifier>(/*allow_mixed_precision=*/true);
 }
 
 /* static */
@@ -142,8 +142,7 @@ StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
           "reference preprocessor must not modify the program shape");
     }
   }
-  TF_RETURN_IF_ERROR(VerifyHloModule(*reference_runner_.backend().platform(),
-                                     reference_module.get()));
+  TF_RETURN_IF_ERROR(hlo_verifier_->Run(reference_module.get()).status());
   return std::move(reference_module);
 }
 
@@ -151,8 +150,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
     std::unique_ptr<HloModule> module, const ArraySlice<Literal*> arguments,
     const optional<ErrorSpec>& error, bool run_hlo_passes,
     const std::function<void(HloModule*)>& reference_preprocessor) {
-  TF_RETURN_IF_ERROR(
-      VerifyHloModule(*test_runner_.backend().platform(), module.get()));
+  TF_RETURN_IF_ERROR(hlo_verifier_->Run(module.get()).status());
   TF_ASSIGN_OR_RETURN(auto reference_module,
                       MakeReferenceModule(*module, reference_preprocessor));
 
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 0cd812fd1b4bc69c34b70d3ca0fd0aa6cf57fa4c..efe6cc67872713a8aeecc11aeafe4902676817a6 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -50,18 +52,18 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (add)
   //                /
   // 1.0f ---------/
-  Computation CreateAdderToOne() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateAdderToOne() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto one = mapped_builder.ConstantR0<float>(1.0);
-    auto adder_to_one = mapped_builder.Add(x, one);
+    mapped_builder.Add(x, one);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
   }
 
-  Computation CreateMax() {
-    ComputationBuilder b(client_, TestName());
+  XlaComputation CreateMax() {
+    XlaBuilder b(TestName());
     auto lhs = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto rhs = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     b.Max(lhs, rhs);
@@ -73,8 +75,8 @@ class MapTest : public ClientLibraryTestBase {
   // Creates a computation that accepts an F32 and returns T(1) (ignoring the
   // argument).
   template <class T>
-  Computation CreateScalarOne() {
-    ComputationBuilder mapped_builder(client_, "scalar_one");
+  XlaComputation CreateScalarOne() {
+    XlaBuilder mapped_builder("scalar_one");
     (void)mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     mapped_builder.ConstantR0<T>(1);
     auto computation_status = mapped_builder.Build();
@@ -87,11 +89,11 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (mul)
   //                /
   // 2.0f ---------/
-  Computation CreateMulByTwo() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateMulByTwo() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto two = mapped_builder.ConstantR0<float>(2.0);
-    auto mul_by_two = mapped_builder.Mul(x, two);
+    mapped_builder.Mul(x, two);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -105,12 +107,12 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (add) ----> (mul)
   //                /
   // 1.0f ---------/
-  Computation CreateAdderToOneTimesItself() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateAdderToOneTimesItself() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto one = mapped_builder.ConstantR0<float>(1.0);
     auto adder_to_one = mapped_builder.Add(x, one);
-    auto result = mapped_builder.Mul(x, adder_to_one);
+    mapped_builder.Mul(x, adder_to_one);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -122,12 +124,13 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} -----------> (map) ----> (add)
   //                         /           /
   // embedded_computation --/       n --/
-  Computation CreateMapPlusN(const Computation& embedded_computation, float n) {
-    ComputationBuilder builder(client_, TestName());
+  XlaComputation CreateMapPlusN(const XlaComputation& embedded_computation,
+                                float n) {
+    XlaBuilder builder(TestName());
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto map = builder.Map({x}, embedded_computation, {});
     auto constant_n = builder.ConstantR0<float>(n);
-    auto add = builder.Add(map, constant_n);
+    builder.Add(map, constant_n);
     auto computation_status = builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -135,11 +138,11 @@ class MapTest : public ClientLibraryTestBase {
 
   // Creates a binary function with signature (F32, F32) -> Pred
   // defined by (x, y) -> x > y.
-  Computation CreateGt() {
-    ComputationBuilder b(client_, "Gt");
+  XlaComputation CreateGt() {
+    XlaBuilder b("Gt");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
-    auto gt = b.Gt(x, y);
+    b.Gt(x, y);
     auto computation_status = b.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -152,13 +155,13 @@ class MapTest : public ClientLibraryTestBase {
   // y {R0F32} ----> (add) ---> (add)
   //                           /
   // z {R0F32} ---------------/
-  Computation CreateTernaryAdder() {
-    ComputationBuilder mapped_builder(client_, "TernaryAdder");
+  XlaComputation CreateTernaryAdder() {
+    XlaBuilder mapped_builder("TernaryAdder");
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = mapped_builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     auto z = mapped_builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "z");
     auto xy = mapped_builder.Add(x, y);
-    auto xyz = mapped_builder.Add(xy, z);
+    mapped_builder.Add(xy, z);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -167,13 +170,13 @@ class MapTest : public ClientLibraryTestBase {
 
 TEST_F(MapTest, MapEachElemPlusOneR0) {
   // Applies lambda (x) (+ x 1)) to an input scalar.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(42.0);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {});
+  builder.Map({param}, CreateAdderToOne(), {});
 
   ComputeAndCompareR0<float>(&builder, 43.0, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -181,13 +184,13 @@ TEST_F(MapTest, MapEachElemPlusOneR0) {
 
 XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0});
+  builder.Map({param}, CreateAdderToOne(), {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -195,55 +198,55 @@ XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
 
 TEST_F(MapTest, MapEachElemPlusOneR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0});
+  builder.Map({param}, CreateAdderToOne(), {0});
 
   ComputeAndCompareR1<float>(&builder, {3.2f, 4.3f, 5.4f, 6.5f},
                              {param0_data.get()}, ErrorSpec(0.01f));
 }
 
 TEST_F(MapTest, MapEachF32ElementToS32Constant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateScalarOne<int32>(), {0});
+  builder.Map({param}, CreateScalarOne<int32>(), {0});
 
   ComputeAndCompareR1<int32>(&builder, {1, 1, 1, 1}, {param0_data.get()});
 }
 
 TEST_F(MapTest, MapEachF32ElementToU32Constant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateScalarOne<uint32>(), {0});
+  builder.Map({param}, CreateScalarOne<uint32>(), {0});
 
   ComputeAndCompareR1<uint32>(&builder, {1, 1, 1, 1}, {param0_data.get()});
 }
 
 TEST_F(MapTest, MapEachElemLongerChainR1) {
   // Maps (lambda (x) (* (+ x 1) x)) onto an input R1F32 vector.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.6f, -5.1f, 0.1f, 0.2f, 999.0f, 255.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOneTimesItself(), {0});
+  builder.Map({param}, CreateAdderToOneTimesItself(), {0});
 
   ComputeAndCompareR1<float>(
       &builder, {9.36f, 20.91f, 0.11f, 0.24f, 999000.0f, 65535.75f},
@@ -253,14 +256,14 @@ TEST_F(MapTest, MapEachElemLongerChainR1) {
 XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0, and then
   // maps (lambda (x) (* x 2)) on the result.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
   auto map1 = builder.Map({param}, CreateAdderToOne(), {0});
-  auto map2 = builder.Map({map1}, CreateMulByTwo(), {0});
+  builder.Map({map1}, CreateMulByTwo(), {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -269,7 +272,7 @@ XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
 TEST_F(MapTest, MapMultipleMapsR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4, and then
   // maps (lambda (x) (* x 2)) on the result.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -277,7 +280,7 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
   auto map1 = builder.Map({param}, CreateAdderToOne(), {0});
-  auto map2 = builder.Map({map1}, CreateMulByTwo(), {0});
+  builder.Map({map1}, CreateMulByTwo(), {0});
 
   ComputeAndCompareR1<float>(&builder, {6.4f, 8.6f, 10.8f, 13.0f},
                              {param0_data.get()}, ErrorSpec(0.01f));
@@ -285,14 +288,14 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 
 TEST_F(MapTest, MapEachElemPlusOneR2) {
   // Maps (lambda (x) (+ x 1)) onto an input R2F32 vector.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2<float>(
       {{13.25f, 14.0f}, {-7.1f, -7.2f}, {-8.8f, 8.8f}});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0, 1});
+  builder.Map({param}, CreateAdderToOne(), {0, 1});
 
   Array2D<float> expected_array(
       {{14.25f, 15.0f}, {-6.1f, -6.2f}, {-7.8f, 9.8f}});
@@ -317,18 +320,18 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) {
   auto embed2 = CreateMapPlusN(embed1, 2.0);
   auto embed3 = CreateMapPlusN(embed1, 4.0);
 
-  ComputationBuilder embed4_builder(client_, "embed4");
+  XlaBuilder embed4_builder("embed4");
   auto embed4_param = embed4_builder.Parameter(0, scalar_shape, "x");
   auto embed4_map_lhs = embed4_builder.Map({embed4_param}, embed2, {});
   auto embed4_map_rhs = embed4_builder.Map({embed4_param}, embed3, {});
-  auto embed4_add = embed4_builder.Add(embed4_map_lhs, embed4_map_rhs);
+  embed4_builder.Add(embed4_map_lhs, embed4_map_rhs);
   auto embed4_status = embed4_builder.Build();
   ASSERT_IS_OK(embed4_status.status());
   auto embed4 = embed4_status.ConsumeValueOrDie();
 
   auto embed5 = CreateMapPlusN(embed2, 6.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto constant_42 = builder.ConstantR0<float>(42.0);
   auto constant_7 = builder.ConstantR0<float>(7.0);
   auto map_42 = builder.Map({constant_42}, embed5, {});
@@ -359,7 +362,8 @@ TEST_F(MapTest, VersionedEmbeddedComputation) {
 
   // Add another Add(1) operation to the existing embedded computation. This
   // requires using the stub interface because the ComputationBuilder does not
-  // allow modification to the Computation objects after they have been built.
+  // allow modification to the XlaComputation objects after they have been
+  // built.
   BinaryOpRequest request;
   request.set_binop(BINOP_ADD);
   *request.mutable_lhs() = adder_to_one;
@@ -381,7 +385,7 @@ TEST_F(MapTest, VersionedEmbeddedComputation) {
 
 TEST_F(MapTest, MapBinaryAdder) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -393,8 +397,7 @@ TEST_F(MapTest, MapBinaryAdder) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(F32, &builder), {0});
+  builder.Map({param0, param1}, CreateScalarAddComputation(F32, &builder), {0});
 
   ComputeAndCompareR1<float>(&builder, {7.3f, 7.7, 4.3f, 0},
                              {param0_data.get(), param1_data.get()},
@@ -404,7 +407,7 @@ TEST_F(MapTest, MapBinaryAdder) {
 // Adds two rank-2 arrays with different layouts. This test exercises a path
 // for Map that used to fail in shape inference (b/28989438).
 XLA_TEST_F(MapTest, AddWithMixedLayouts) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2WithLayout(
       {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({1, 0}));
   std::unique_ptr<GlobalData> param0_data =
@@ -417,8 +420,8 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(S32, &builder), {0, 1});
+  builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder),
+              {0, 1});
 
   Array2D<int32> expected(2, 2);
   expected(0, 0) = 11;
@@ -430,7 +433,7 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 }
 
 XLA_TEST_F(MapTest, AddR3_3x0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR3FromArray3D<int32>(Array3D<int32>(3, 0, 2));
   std::unique_ptr<GlobalData> param0_data =
@@ -443,8 +446,8 @@ XLA_TEST_F(MapTest, AddR3_3x0x2) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(S32, &builder), {0, 1, 2});
+  builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder),
+              {0, 1, 2});
 
   ComputeAndCompareR3<int32>(&builder, Array3D<int32>(3, 0, 2),
                              {param0_data.get(), param1_data.get()});
@@ -452,7 +455,7 @@ XLA_TEST_F(MapTest, AddR3_3x0x2) {
 
 TEST_F(MapTest, MapTernaryAdder) {
   // Maps (lambda (x y z) (+ x y z)) onto three R1F32 vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -469,7 +472,7 @@ TEST_F(MapTest, MapTernaryAdder) {
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
   auto param2 = builder.Parameter(2, param2_literal->shape(), "param2");
-  auto map = builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0});
+  builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0});
 
   ComputeAndCompareR1<float>(
       &builder, {-2.7f, -92.3f, -895.7f, -400.0f},
@@ -479,24 +482,24 @@ TEST_F(MapTest, MapTernaryAdder) {
 
 TEST_F(MapTest, MapGt) {
   // Maps (x,y) -> x > y onto two R1F32 vectors.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto gt = CreateGt();
   b.Map({b.ConstantR1<float>({1, 20}), b.ConstantR1<float>({10, 2})}, gt, {0});
   ComputeAndCompareR1<bool>(&b, {false, true}, {});
 }
 
 TEST_F(MapTest, NestedBinaryMap) {
-  Computation max_with_square;
+  XlaComputation max_with_square;
   {
     // max_with_square(x) = do max(x, x^2) via a map.
-    ComputationBuilder b(client_, "max_with_square");
+    XlaBuilder b("max_with_square");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     b.Map({x, b.Mul(x, x)}, CreateMax(), {});
     auto computation_status = b.Build();
     ASSERT_IS_OK(computation_status.status());
     max_with_square = computation_status.ConsumeValueOrDie();
   }
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = b.ConstantR1<float>({0.1f, 0.5f, -0.5f, 1.0f, 2.0f});
   b.Map({input}, max_with_square, {0});
   ComputeAndCompareR1<float>(&b, {0.1f, 0.5f, 0.25f, 1.0f, 4.0f}, {});
@@ -505,13 +508,13 @@ TEST_F(MapTest, NestedBinaryMap) {
 TEST_F(MapTest, MapOperantionWithBuildError) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors but uses an unsupported
   // type combination (F32 + U16) to test that the error is reported to the
-  // outermost ComputationBuilder.
-  ComputationBuilder builder(client_, TestName());
+  // outermost XlaBuilder.
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("ErrorAdd");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
   auto y = sub_builder->Parameter(1, ShapeUtil::MakeShape(U16, {}), "y");
-  auto adder = sub_builder->Add(x, y);
+  sub_builder->Add(x, y);
   auto error_add = sub_builder->BuildAndNoteError();
 
   std::unique_ptr<Literal> param0_literal =
@@ -525,9 +528,9 @@ TEST_F(MapTest, MapOperantionWithBuildError) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1}, error_add, {0});
+  builder.Map({param0, param1}, error_add, {0});
 
-  StatusOr<Computation> computation_status = builder.Build();
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_TRUE(!computation_status.ok());
   EXPECT_THAT(
       computation_status.status().ToString(),
@@ -545,7 +548,7 @@ using MapTestWithFullOpt = ClientLibraryTestBase;
 // to have issues with such patterns and maybe invalidate the pointer to entry
 // computation.
 TEST_F(MapTestWithFullOpt, MapScalarPower) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
@@ -572,7 +575,7 @@ TEST_F(MapTestWithFullOpt, MapScalarPower) {
 // Regression test for b/35786417, where the inliner would not notice the change
 // of parameter order inside the map.
 TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
@@ -598,7 +601,7 @@ TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
 // Regression test for b/35786417, where the inliner would CHECK-fail due to the
 // mul inside the map having more parameters than the map does.
 TEST_F(MapTestWithFullOpt, MapSquare) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index 8cef8dd34dc7b16b1e58ded67d6b6a4ba79f20db..ce295b832d79e4f00656f2893c2ba1162693dd73 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -85,7 +85,7 @@ class PadTestFloat : public PadTest,
 
 // Tests a Pad() with a zero-element input and output.
 XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 0, high: 0, interior: 0}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -100,7 +100,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
 
 // Tests a Pad() with a zero-element input but a non-zero-element output.
 XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -115,7 +115,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -130,7 +130,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Pad(AddParam(Array4D<float>(2, 0, 3, 2), &b),
         AddParam(*Literal::CreateR0<float>(1.5), &b), r4_padding_on_dim0_dim1_);
   ComputeAndCompareR4<float>(&b, Array4D<float>(5, 2, 3, 2, 1.5f), {},
@@ -138,7 +138,7 @@ XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = MakeUnique<Array4D<float>>(1, 1, 3, 2);
   Array2D<float> input_xy({
       {1.0f, 2.0f},  // row 0
@@ -162,7 +162,7 @@ TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   const float pad_value = 1.5f;
   Array4D<float> input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -181,7 +181,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   PaddingConfig padding_config;
   auto dimension0 = padding_config.add_dimensions();
@@ -223,7 +223,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   PaddingConfig padding_config;
   auto dimension0 = padding_config.add_dimensions();
@@ -266,7 +266,7 @@ XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
 }
 
 XLA_TEST_F(PadTest, Pad4DU8Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = MakeUnique<Array4D<uint8>>(1, 1, 3, 2);
   Array2D<uint8> input_xy({
       {1, 2},  // row 0
@@ -290,7 +290,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) {
 }
 
 XLA_TEST_F(PadTest, Pad4DPredArray) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   // Since bool is currently not well supported, use Broadcast operation to
   // create the operand for Pad.
@@ -317,7 +317,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
 }
 
 XLA_TEST_P(PadTestFloat, Large2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   auto ones = MakeUnique<Array2D<float>>(4, 4);
   ones->Fill(1.0f);
@@ -329,15 +329,14 @@ XLA_TEST_P(PadTestFloat, Large2DPad) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 +
                                                                   100 * dim);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f);
   ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 35;
   constexpr int64 in_cols = 35;
@@ -352,15 +351,14 @@ XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
   padding_config.mutable_dimensions(1)->set_edge_padding_low(6);
   padding_config.mutable_dimensions(1)->set_edge_padding_high(4);
   padding_config.mutable_dimensions(1)->set_interior_padding(2);
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f);
   ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 XLA_TEST_P(PadTestFloat, High2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 129;
   constexpr int64 in_cols = 129;
@@ -378,8 +376,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -387,7 +384,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
 }
 
 XLA_TEST_P(PadTestFloat, NegativePadding2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 129;
   constexpr int64 in_cols = 129;
@@ -406,8 +403,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -415,7 +411,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
 }
 
 XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 8;
   constexpr int64 in_cols = 11;
@@ -434,8 +430,7 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding[dim]);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -444,20 +439,19 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
 
 // Regression test for b/31827337.
 XLA_TEST_P(PadTestFloat, ReducePad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
   ones->Fill(1.0);
   auto input = AddParam(*ones, &b);
 
-  Computation add = CreateScalarAddComputation(FloatType(), &b);
+  XlaComputation add = CreateScalarAddComputation(FloatType(), &b);
   auto reduce =
       b.Reduce(input, AddParam(*Literal::CreateR0<float>(0.0), &b), add, {0});
 
   PaddingConfig padding_config = MakeNoPaddingConfig(3);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(1);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(1);
-  auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b),
-                      padding_config);
+  b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b), padding_config);
 
   Array3D<float> expected({{{0.0, 0.0}, {0.0, 0.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index dc7ce3253cee255a7949326fa5b49fc8917432b8..b311785449f1774c3bc1e4d7ad35c2866e3b4061 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -20,9 +20,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
@@ -228,15 +228,14 @@ XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) {
   // This is required for proper handling of NaN values.
   SetFastMathDisabled(true);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({input_values});
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(*a_literal).ConsumeValueOrDie();
   auto a = builder.Parameter(0, a_literal->shape(), "a");
 
-  auto reduce_precision =
-      builder.ReducePrecision(a, exponent_bits, mantissa_bits);
+  builder.ReducePrecision(a, exponent_bits, mantissa_bits);
 
   ComputeAndCompareR1<float>(&builder, expected_values, {a_data.get()});
 }
@@ -252,7 +251,7 @@ class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -265,7 +264,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
   // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the
   // reduce-precision operation showed up in the correct place in the
   // graph.
-  auto log = builder.Log(abs);
+  builder.Log(abs);
 
   // Insert precision-reduction after the Abs(x) operation, rounding that
   // result to exactly 1.0f.
@@ -281,7 +280,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -290,7 +289,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass after operation fusion, suffixing kAbs operations.  This
   // should not see into the fusion nodes and thus should not affect the
@@ -307,7 +306,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -316,7 +315,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass after operation fusion, suffixing kFusion operations.
   auto reduce_precision_pass = execution_options_.mutable_debug_options()
@@ -331,7 +330,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -340,7 +339,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass suffixing fusion nodes containing kCos operations.  This
   // should have no effect.
@@ -356,7 +355,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -365,7 +364,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass suffixing fusion nodes containing kAbs operations.  This
   // should see the kAbs operation within the above fusion node.
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index d24927d22b6534b46e711cd442f19a3e5cfcebdf..768beec15e7ec3b8e7d2b4ed8a5aae62fac9dd7a 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -39,6 +39,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -502,21 +504,18 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
 // Test that algebraic simplifier does not incorrectly fold a transpose into a
 // reduction operation.
 XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) {
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50});
-  ComputationDataHandle input = builder.Parameter(0, input_shape, "input");
-  ComputationDataHandle zero = builder.ConstantR0<float>(0.0);
-  ComputationDataHandle transpose =
-      builder.Transpose(input, /*permutation=*/{1, 0, 2});
-  ComputationDataHandle reduce =
-      builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
+  XlaOp input = builder.Parameter(0, input_shape, "input");
+  XlaOp zero = builder.ConstantR0<float>(0.0);
+  XlaOp transpose = builder.Transpose(input, /*permutation=*/{1, 0, 2});
+  builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> input_data,
                           MakeFakeLiteral(input_shape));
 
-  ComputeAndCompare(&builder, reduce, {std::move(*input_data)},
-                    ErrorSpec(0.01, 1e-4));
+  ComputeAndCompare(&builder, {std::move(*input_data)}, ErrorSpec(0.01, 1e-4));
 }
 
 XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 8dd24f1237136e2807cea8a261ead25f5c7adbb2..6a054a5dd39d326630958b1fce877e07fe56eb35 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -21,10 +21,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -63,11 +64,9 @@ class ReduceWindowTestBase : public ClientLibraryTestBase {
 class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                          public ReduceWindowTestBase {
  public:
-  ReduceWindowTest() : builder_(client_, TestName()) {
-    set_use_bfloat16(GetParam());
-  }
+  ReduceWindowTest() : builder_(TestName()) { set_use_bfloat16(GetParam()); }
 
-  void ReduceWindowAdd(const ComputationDataHandle& input,
+  void ReduceWindowAdd(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -78,16 +77,17 @@ class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                           window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMax(const ComputationDataHandle& input,
+  void ReduceWindowMax(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
     auto init = CreateConstantFromLiteral(Literal::MinValue(F32), &builder_);
-    builder_.ReduceWindow(input, init, CreateScalarMax(), window_dimensions,
-                          window_strides, padding);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarMaxComputation(FloatType(), &builder_),
+                          window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMin(const ComputationDataHandle& input,
+  void ReduceWindowMin(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -97,7 +97,7 @@ class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                           window_dimensions, window_strides, padding);
   }
 
-  ComputationBuilder builder_;
+  XlaBuilder builder_;
 };
 
 TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
@@ -310,7 +310,7 @@ XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   auto rhs = b->Parameter(1, scalar, "rhs");
   b->Min(b->Add(lhs, rhs),
          CreateConstantFromLiteral(*Literal::CreateR0<float>(8.0f), b.get()));
-  Computation reduce_fn = b->BuildAndNoteError();
+  XlaComputation reduce_fn = b->BuildAndNoteError();
 
   builder_.ReduceWindow(
       input,
@@ -338,7 +338,7 @@ TEST_P(ReduceWindowTest, R4UnitWindow) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({0, 3, 2, 1}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -406,7 +406,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -428,7 +428,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -450,7 +450,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -551,7 +551,7 @@ TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
 
 TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   Array2D<float> input_array(6, 4, 1.0f);
-  ComputationDataHandle input = builder_.Broadcast(
+  XlaOp input = builder_.Broadcast(
       CreateConstantFromLiteral(Literal::One(F32), &builder_), {6, 4});
 
   Padding padding = Padding::kSame;
@@ -610,7 +610,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
   R4ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
   void DoIt() {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     const auto& param = ::testing::get<0>(GetParam());
 
     const float kInitValue = 0.0f;
@@ -621,7 +621,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
     std::unique_ptr<Literal> input_literal =
         Literal::CreateR4FromArray4DWithLayout(
             input, LayoutUtil::MakeLayout(param.layout));
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                        &b, &parameter);
 
@@ -962,7 +962,7 @@ class R3ReduceWindowTest : public ReduceWindowTestBase,
 };
 
 TEST_P(R3ReduceWindowTest, Add) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd);
 
@@ -973,7 +973,7 @@ TEST_P(R3ReduceWindowTest, Add) {
       Literal::CreateR3FromArray3DWithLayout(
           input, LayoutUtil::MakeLayout(param.layout));
 
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                      &b, &parameter);
   auto init_value =
@@ -1063,15 +1063,12 @@ struct R2ReduceWindowTestData {
      /*strides=*/{1, 1}, /*pad_low=*/{0, 130}, /*pad_high=*/{0, 0},
      /*layout=*/{1, 0},
      /*reducer=*/Reducer::kAdd},
-// TODO(b/76025683): These tests fail on TPU.
-#if defined(XLA_TEST_BACKEND_CPU) || defined(XLA_TEST_BACKEND_GPU)
-    {/*base_bounds=*/{4096, 4096}, /*window_bounds=*/{1, 4},
-     /*strides=*/{1, 1024}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0},
-     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{8, 256}, /*window_bounds=*/{1, 4},
      /*strides=*/{1, 64}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
      /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
-#endif
+    {/*base_bounds=*/{4096, 4096}, /*window_bounds=*/{1, 4},
+     /*strides=*/{1, 1024}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0},
+     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
 };
 
 string R2ReduceWindowTestDataToString(
@@ -1100,7 +1097,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
   R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
   void DoIt() {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     const auto& param = ::testing::get<0>(GetParam());
     CHECK(param.reducer == kAdd);
 
@@ -1110,7 +1107,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
         Literal::CreateR2FromArray2DWithLayout(
             input, LayoutUtil::MakeLayout(param.layout));
 
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                        &b, &parameter);
     std::vector<std::pair<int64, int64>> padding(2);
@@ -1298,7 +1295,7 @@ class R1ReduceWindowTest : public ReduceWindowTestBase,
 };
 
 TEST_P(R1ReduceWindowTest, DoIt) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd || param.reducer == kMax);
 
@@ -1307,7 +1304,7 @@ TEST_P(R1ReduceWindowTest, DoIt) {
   std::iota(std::begin(input_vector), std::end(input_vector), 0);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR1(tensorflow::gtl::ArraySlice<float>(input_vector));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                      &b, &parameter);
 
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index 02272d60171c70896f44b0d6b96f176ea52e686f..d7462d581b8596dc43b81b0162b3f5020cebb546 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -20,11 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -53,11 +52,11 @@ class ReshapeTest : public ::testing::WithParamInterface<bool>,
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to 1 dimension.
 XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(1, 1);
   input_array.Fill(1.0f);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -68,9 +67,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
 }
 
 XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{});
@@ -81,9 +80,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
 }
 
 XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0});
@@ -95,11 +94,11 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to scalar.
 XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(1, 1);
   input_array.Fill(1.0f);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   auto reshape = builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -112,15 +111,14 @@ XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
 }
 
 XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(1.0f);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
                                                  &builder, &parameter);
   auto a = builder.Neg(parameter);
-  auto reshape =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
+  builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
 
   auto expected_literal = Literal::CreateR1<float>({-1.0f});
   ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
@@ -131,10 +129,10 @@ XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(0, 3);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -147,11 +145,11 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-05-15
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -164,10 +162,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(3, 0);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -178,9 +176,9 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
 
 // Collapses a 2-dimensional row vector to 1 dimension.
 XLA_TEST_P(ReshapeTest, Trivial1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -191,9 +189,9 @@ XLA_TEST_P(ReshapeTest, Trivial1x3) {
 
 // Collapses a 2-dimensional column vector to 1 dimension.
 XLA_TEST_P(ReshapeTest, Trivial3x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR2<float>({{1.0f}, {2.0f}, {3.0f}});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -344,9 +342,9 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 4, 0));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -359,10 +357,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
 XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -379,9 +377,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
 // with an incorrect result rank.
 //
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 6));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -394,10 +392,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), and reorder the input (shuffle).
 XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -421,9 +419,9 @@ static Array3D<float> ArrayForDocR3Tests() {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
@@ -436,9 +434,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
@@ -456,9 +454,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -471,9 +469,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -491,9 +489,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -521,12 +519,12 @@ XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
 //
 // 1 2 3 4 5 6 1 2 3 4 5 6
 XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> t2x2x2x3(2, 2, 2, 3);
   auto filler2x3 = MakeLinspaceArray2D(1.0f, 6.0f, 2, 3);
   t2x2x2x3.FillWithYX(*filler2x3);
   auto input_literal = Literal::CreateFromArray(t2x2x2x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3});
@@ -540,7 +538,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
 
 // As above, but uses reshape directly.
 XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> t(2, 1, 2, 2);
   t(0, 0, 0, 0) = 0;
   t(0, 0, 0, 1) = 1;
@@ -551,7 +549,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
   t(1, 0, 1, 0) = 6;
   t(1, 0, 1, 1) = 7;
   auto input_literal = Literal::CreateFromArray(t);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -566,7 +564,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
 // Reshape various ranks to a scalar.
 XLA_TEST_P(ReshapeTest, ToScalar) {
   for (int rank = 0; rank < 8; ++rank) {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     std::vector<int64> ones(rank, 1);  // this is {1, ..., 1}.
     std::vector<int64> dimensions(rank);
     std::iota(dimensions.begin(), dimensions.end(), 0);
@@ -574,7 +572,7 @@ XLA_TEST_P(ReshapeTest, ToScalar) {
     std::vector<int64> zeros(rank, 0);  // this is {0, ..., 0}.
     input_literal.Set<float>(zeros, 83.0f);
 
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input = CreateParameterAndTransferLiteral(0, input_literal, "input",
                                                    &b, &parameter);
     b.Reshape(parameter, dimensions, {});
@@ -586,9 +584,9 @@ XLA_TEST_P(ReshapeTest, ToScalar) {
 }
 
 XLA_TEST_P(ReshapeTest, BadDimensions) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
                                                  &parameter);
   b.Reshape(parameter, {}, {});
@@ -598,9 +596,9 @@ XLA_TEST_P(ReshapeTest, BadDimensions) {
 }
 
 XLA_TEST_P(ReshapeTest, BadNewSizes) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f, 2.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
                                                  &parameter);
   b.Reshape(parameter, {1}, {});
@@ -609,7 +607,7 @@ XLA_TEST_P(ReshapeTest, BadNewSizes) {
 }
 
 XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   auto input_literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
     {
@@ -635,7 +633,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   },
        LayoutUtil::MakeLayout({0, 1, 2, 3}));
   // clang-format on
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
 
@@ -646,7 +644,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
       {222, 333, 444, 555, 666, 777, 888, 999},
   });
 
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
@@ -664,13 +662,13 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
 }
 
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
@@ -691,13 +689,13 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
 
 // Tests R2->R4 reshape with the reshape dimensions {1, 0}.
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
@@ -717,7 +715,7 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 1, 1);
@@ -727,7 +725,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
@@ -739,7 +737,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 4, 1);
@@ -749,7 +747,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
@@ -762,7 +760,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
 
 // Tests R4->R2 reshape with the reshape dimensions {0, 2, 1, 3}.
 XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(5, 10, 2, 3);
@@ -772,7 +770,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 2, 1, 3},
@@ -789,7 +787,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
 }
 
 XLA_TEST_P(ReshapeTest, NoopReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input_array(2, 3, 5, 7);
@@ -799,12 +797,12 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({1, 2, 3, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{3, 0, 1, 2},
                   /*new_sizes=*/{7, 2, 3, 5});
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
@@ -827,12 +825,12 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -846,8 +844,8 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape) {
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 3, 2, 0},
@@ -880,8 +878,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -909,8 +907,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -938,8 +936,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -968,8 +966,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -997,8 +995,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 0, 2, 3},
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 8fc841f14087cdea02fe44cdaea521ff92122aec..6959c95502cb7af6b720592e7836c6789719a528 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -85,7 +85,7 @@ TEST_P(FloatReverseTest, Reverses) {
   auto r1_literal = Literal::CreateR1<float>(input_vector);
   auto input_literal = r1_literal->Reshape(spec.input_dims).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = AddParam(*input_literal, &builder);
   builder.Rev(a, spec.reversal);
 
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index d268fdcacebcb162bf61bc7dd4b208f4db6c4a5f..7015e5a6a31f506d30c2629d7735482cf354455a 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -50,7 +50,7 @@ class SelectAndScatterTest
     : public ClientLibraryTestBase,
       public ::testing::WithParamInterface<SelectAndScatterTestParam> {
  public:
-  SelectAndScatterTest() : builder_(client_, TestName()) {
+  SelectAndScatterTest() : builder_(TestName()) {
     // Create S32 GE and ADD computations for select and scatter respectively.
     ge_s32_ = CreateScalarGeComputation(S32, &builder_);
     add_s32_ = CreateScalarAddComputation(S32, &builder_);
@@ -60,13 +60,13 @@ class SelectAndScatterTest
     min_f32_ = CreateScalarMinComputation(F32, &builder_);
   }
 
-  ComputationBuilder builder_;
-  Computation ge_s32_;
-  Computation add_s32_;
-  Computation ge_f32_;
-  Computation add_f32_;
-  Computation max_f32_;
-  Computation min_f32_;
+  XlaBuilder builder_;
+  XlaComputation ge_s32_;
+  XlaComputation add_s32_;
+  XlaComputation ge_f32_;
+  XlaComputation add_f32_;
+  XlaComputation max_f32_;
+  XlaComputation min_f32_;
 };
 
 XLA_TEST_P(SelectAndScatterTest, ParamTest) {
@@ -80,12 +80,11 @@ XLA_TEST_P(SelectAndScatterTest, ParamTest) {
   s.FillRandom(12.0f);
   auto source = builder_.ConstantFromArray(s);
 
-  auto select_and_scatter = builder_.SelectAndScatter(
-      operand, ge_f32_, GetParam().window_dimensions, GetParam().window_strides,
-      GetParam().padding_type, source, builder_.ConstantR0<float>(0.0f),
-      add_f32_);
+  builder_.SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions,
+                            GetParam().window_strides, GetParam().padding_type,
+                            source, builder_.ConstantR0<float>(0.0f), add_f32_);
 
-  ComputeAndCompare(&builder_, select_and_scatter, {}, ErrorSpec(1e-5));
+  ComputeAndCompare(&builder_, {}, ErrorSpec(1e-5));
 }
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index 69fbe98bd63661322d37936c90a5fe3580efc2de..52195db2aa74710b901dd7744a670764a034e96b 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -299,9 +299,12 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestBigSlicesInstantiation,
     SliceR1LargeTest,
     ::testing::Values(
-          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
-          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
-          R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1}
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1}
     ),
     SliceR1TestDataToString
 );
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 68f75d50cbaa9b6eab56714987196cc9aced60a9..cda1989fad670c805f30b5043e342d5f9a9a6fe2 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -165,7 +165,7 @@ enum class ConstantType { kUnknown, kZero, kOne };
 // Return the constant type required by this computation, if known.
 ConstantType GetInitValue(const HloComputation& computation) {
   const HloInstruction* const root = computation.root_instruction();
-  if (computation.num_parameters() != 2 ||
+  if (computation.num_parameters() != 2 || root->operand_count() != 2 ||
       root->operand(0)->opcode() != HloOpcode::kParameter ||
       root->operand(1)->opcode() != HloOpcode::kParameter ||
       root->operand(0) == root->operand(1)) {
@@ -340,8 +340,8 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
 }
 
 Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module) {
-  return HloVerifier().Run(module).status();
+                       HloModule* const module, bool allow_mixed_precision) {
+  return HloVerifier(allow_mixed_precision).Run(module).status();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index 0fb024ffb074f1c90b75022bc7f5a8b58b03c0c2..b5ab779574fd5237d14cd24c345a9d5f1d41d1fd 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -69,7 +69,8 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
 // Check that a given module satisfies various constraints before trying to
 // execute it.
 Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module);
+                       HloModule* const module,
+                       bool allow_mixed_precision = false);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8efc6e2a83f42bf81fc1261ba508632cf3f85b3
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+// A test fixture is used because we need a client for our computation builder.
+class TestUtilsTest : public LocalClientTestBase {};
+
+XLA_TEST_F(TestUtilsTest, UnusedParam) {
+  ComputationBuilder builder(local_client_, TestName());
+  // Make the reduction lambda.
+  Shape single_float = ShapeUtil::MakeShape(F32, {});
+  builder.Parameter(0, single_float, "unused");
+  builder.Parameter(1, single_float, "used");
+  auto computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  // Make the reduction.
+  Shape pair_float = ShapeUtil::MakeShape(F32, {2});
+  builder.Reduce(builder.Parameter(0, pair_float, "operand"),
+                 builder.Parameter(1, single_float, "init"),
+                 computation_status.ValueOrDie(), {0});
+  computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  auto executable_status = local_client_->Compile(
+      computation_status.ValueOrDie(), {&pair_float, &single_float},
+      ExecutableBuildOptions());
+  TF_ASSERT_OK(executable_status.status());
+  HloModule& module = const_cast<HloModule&>(
+      executable_status.ValueOrDie()->executable()->module());
+  TF_ASSERT_OK(MakeFakeArguments(&module).status());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
index 0af40bc15a41f7c4ef6382b1a94412afe5741a86..a9f2915b458b1816926de727b3da21982d06f6c0 100644
--- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
+++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -32,14 +33,15 @@ GTEST_API_ int main(int argc, char** argv) {
   // tests.
   for (int i = 1; i < argc; i++) {
     tensorflow::StringPiece arg(argv[i]);
-    if (arg == "--benchmarks" || arg.starts_with("--benchmarks=")) {
+    if (arg == "--benchmarks" ||
+        tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
       const char* pattern = nullptr;
-      if (arg.starts_with("--benchmarks=")) {
+      if (tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
         pattern = argv[i] + strlen("--benchmarks=");
       } else {
         // Handle flag of the form '--benchmarks foo' (no '=').
         if (i + 1 >= argc ||
-            tensorflow::StringPiece(argv[i + 1]).starts_with("--")) {
+            tensorflow::str_util::StartsWith(argv[i + 1], "--")) {
           LOG(ERROR) << "--benchmarks flag requires an argument.";
           return 2;
         }
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index e60a5a4919f2207939821e787c3c59a08ff3ba4e..b2f122982adf750106f034e7e786367720ebafcf 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -724,6 +724,15 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           shape, operands[0], *broadcast_dimensions));
       break;
     }
+    case HloOpcode::kBroadcastDimOne: {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateBroadcastDimOne(shape, operands[0]));
+      break;
+    }
     case HloOpcode::kConcatenate: {
       optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index adc8b1d620eb65fdca19072831360b71847abf9e..57684b58346166f7e3ef9576f6cd8f70ab9dc389 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -57,6 +57,18 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 
+)"
+},
+// broadcast size-one dimensions
+{
+"BroadcastDimOne",
+R"(HloModule broadcast_dim_one_module
+
+ENTRY %broadcast-dim-one () -> f32[2,2] {
+  %constant = f32[1,2]{1,0} constant(f32[1,2] { { 1.1, 2.2 } })
+  ROOT %broadcast-dim-one = f32[2,2]{1,0} broadcast-dim-one(f32[1,2]{1,0} %constant)
+}
+
 )"
 },
 // pred constant
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 6b136d333bbf079efd314833f46fe3b98743fbac..1439f1bcc5cec39203a7cb4b1f8604e7349382c6 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -6,7 +6,9 @@ load("//tensorflow/core:platform/default/build_config_root.bzl",
      "if_static")
 
 # xla_proto_library() is a convenience wrapper around cc_proto_library.
-def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
+def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0, **kwargs):
+  if kwargs.get('use_grpc_plugin'):
+    kwargs['use_grpc_namespace'] = True
   cc_proto_library(name=name,
                    srcs=srcs,
                    deps=deps,
@@ -16,6 +18,13 @@ def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
                    ),
                    protoc="@protobuf_archive//:protoc",
                    testonly=testonly,
-                   visibility=visibility,)
+                   visibility=visibility,
+                   **kwargs)
+
+def xla_py_grpc_library(**kwargs):
+  # Note: we don't currently define any special targets for Python GRPC in OSS.
+  _ignore = kwargs
+  pass
+
 
 ORC_JIT_MEMORY_MAPPER_TARGETS = []
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index f9943f71d31627a95ccb2dbbf77a1b203eff79b4..f619b8dc24038af64a27fc0565c74447ca9d09cf 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -192,6 +192,9 @@ message DebugOptions {
   // Generate calls to MKL-DNN in the CPU backend.
   bool xla_cpu_use_mkl_dnn = 97;
 
+  // Maximum kernel unroll factor for the GPU backend.
+  int32 xla_gpu_max_kernel_unroll_factor = 98;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
@@ -417,6 +420,11 @@ message ComputeConstantRequest {
   repeated LiteralProto parameters = 4;
 }
 
+message ComputeConstantGraphRequest {
+  HloModuleProto computation = 1;
+  Layout output_layout = 2;
+}
+
 message ComputeConstantResponse {
   // A LiteralProto is returned directly for this request, instead of a
   // ComputationDataHandle.
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 1f16e6d25178fd9c10a30b0c500e090ee2e08117..f18d53c6089e8d4411099be8fb0fb8c349ace4f7 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -355,17 +355,19 @@ message WindowDimension {
   // positions of the window in this dimension.
   int64 stride = 2;
 
-  // If positive, means the amount of padding with zeroes to add to the base
-  // area at the low end of this dimension; if negative, its negative means the
-  // number of elements removed from the low end of this dimension. For example,
-  // in the horizontal dimension of a rectangle, this would be the number of
-  // zeroes to pad on the left, given that indices increase when going right.
+  // If positive, means the amount of padding to add to the base area at the low
+  // end of this dimension; if negative, its negative means the number of
+  // elements removed from the low end of this dimension. For example, in the
+  // horizontal dimension of a rectangle, this would be the number of padding
+  // values to pad on the left, given that indices increase when going right.
+  // The actual padding value depends upon the context. Convolution pads with
+  // zeros. ReduceWindow and SelectAndScatter pads with the reduce function's
+  // init value.
   int64 padding_low = 3;
 
-  // As padding_low, but on the high end of this dimension. For
-  // example, in the horizontal dimension of a rectangle, this would
-  // be the number of zeroes to pad on the right, given that indices
-  // increase when going right.
+  // As padding_low, but on the high end of this dimension. For example, in the
+  // horizontal dimension of a rectangle, this would be the number of values to
+  // pad on the right, given that indices increase when going right.
   int64 padding_high = 4;
 
   // Dilation factor of the sliding window in this dimension. A dilation factor
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index bf69144ad83c9b5f9a51d4c9e6fbfe61b5f16fb2..9bef0d8b61ebe8ae65c991c7e414f8f6e58f10d5 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -81,6 +81,7 @@ py_library(
         "//tensorflow/contrib/quantize:quantize_graph",
         "//tensorflow/contrib/autograph",
         "//tensorflow/contrib/receptive_field:receptive_field_py",
+        "//tensorflow/contrib/recurrent:recurrent_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 1c5b00f92eace598dea5f035e4954b4b2de8da0e..aaddb06fa0c22d6162815dc2dbf24e6dc79c0df8 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -66,6 +66,7 @@ from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
+from tensorflow.contrib import recurrent
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index 61f6bfd7e733fc3e2e0bea35a955509c39d57bc9..b9088026c1eba381ba8bd7218594a36e8a2bd6d1 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -23,7 +23,6 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import namedtuple
-import types
 
 import gast
 
@@ -114,7 +113,7 @@ class CallTreeTransformer(transformer.Base):
   def _function_is_compilable(self, target_entity):
     """Determines whether an entity can be compiled at all."""
     # TODO(mdan): This is just a placeholder. Implement.
-    return not isinstance(target_entity, types.BuiltinFunctionType)
+    return not inspect_utils.isbuiltin(target_entity)
 
   def _should_compile(self, node, fqn):
     """Determines whether an entity should be compiled in the context."""
@@ -147,7 +146,7 @@ class CallTreeTransformer(transformer.Base):
       # Inspect the target function decorators. If any include a @convert
       # or @graph_ready annotation, then they must be called as they are.
       # TODO(mdan): This may be quite heavy.
-      # To parse and re-analize each function for every call site could be quite
+      # To parse and re-analyze each function for every call site could be quite
       # wasteful. Maybe we could cache the parsed AST?
       try:
         target_node, _ = parser.parse_entity(target_entity)
diff --git a/tensorflow/contrib/autograph/converters/call_trees_test.py b/tensorflow/contrib/autograph/converters/call_trees_test.py
index c666dcb73b232ce443898cfe3359f74605af98f2..303dd54a4ee49de27fad0c5cdc2d6274abfe0fa8 100644
--- a/tensorflow/contrib/autograph/converters/call_trees_test.py
+++ b/tensorflow/contrib/autograph/converters/call_trees_test.py
@@ -34,7 +34,7 @@ class CallTreesTest(converter_test_base.TestCase):
   def test_basic(self):
 
     def test_fn_1(_):
-      raise ValueError('This should not be called in the compiled verison.')
+      raise ValueError('This should not be called in the compiled version.')
 
     def renamed_test_fn_1(a):
       return a + 1
diff --git a/tensorflow/contrib/autograph/converters/decorators_test.py b/tensorflow/contrib/autograph/converters/decorators_test.py
index e67ab1cd6a15ceb66fe75140419c7abca9653ae4..9c01f689127dbedad7669c65b03e7da071b2d64d 100644
--- a/tensorflow/contrib/autograph/converters/decorators_test.py
+++ b/tensorflow/contrib/autograph/converters/decorators_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 
 # The Python parser only briefly captures decorators into the AST.
 # The interpreter desugars them on load, and the decorated function loses any
-# trace of the decorator (which is notmally what you would expect, since
+# trace of the decorator (which is normally what you would expect, since
 # they are meant to be transparent).
 # However, decorators are still visible when you analyze the function
 # from inside a decorator, before it was applied - as is the case
diff --git a/tensorflow/contrib/autograph/converters/name_scopes.py b/tensorflow/contrib/autograph/converters/name_scopes.py
index 2a3f474360e94635470bf9581222e4c79f46b7a1..dfee529abaa8c14d9b408819b32c5199500a2c2f 100644
--- a/tensorflow/contrib/autograph/converters/name_scopes.py
+++ b/tensorflow/contrib/autograph/converters/name_scopes.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Wraps a function body with a `name_scope` of the function name.
-"""
+"""Wraps a function body with a `name_scope` of the function name."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,23 +27,46 @@ from tensorflow.contrib.autograph.pyct import transformer
 class FunctionNameScopeTransformer(transformer.Base):
   """Wrap a function body with a `name_scope` of the function name."""
 
-  def __init__(self, context):
-    super(FunctionNameScopeTransformer, self).__init__(context)
-    self._function_level = 0
+  def _name_for_current_scope(self):
+    innermost = self.enclosing_entities[-1]
+    if len(self.enclosing_entities) > 1:
+      parent = self.enclosing_entities[-2]
+      if isinstance(parent, gast.ClassDef):
+        # Methods also take the name of their class.
+        name = '%s/%s' % (parent.name, innermost.name)
+      else:
+        name = innermost.name
+    else:
+      name = innermost.name
+
+    # Sanitize the name.
+    # See https://www.tensorflow.org/api_docs/python/tf/Graph#name_scope
+    # TensorFlow doesn't like leading underscores at the top level.
+    while name[0] == '_':
+      name = name[1:]
+    return name
 
   def visit_FunctionDef(self, node):
-    self._function_level += 1
-    try:
-      self.generic_visit(node)
-    finally:
-      self._function_level -= 1
-    scope_name = node.name
-    if self._function_level == 0 and self.context.owner_type is not None:
-      scope_name = '{}/{}'.format(self.context.owner_type.__name__, scope_name)
-    node.body = templates.replace(
-        'with tf.name_scope(scope_name): body',
-        scope_name=gast.Str(scope_name),
-        body=node.body)
+    node = self.generic_visit(node)
+
+    unscoped_body = []
+    scoped_body = node.body
+    if scoped_body:
+      first = scoped_body[0]
+      if isinstance(first, gast.Expr) and isinstance(first.value, gast.Str):
+        # Skip any docstring.
+        unscoped_body = scoped_body[:1]
+        scoped_body = scoped_body[1:]
+
+    template = """
+      with tf.name_scope(scope_name):
+        body
+    """
+    scoped_body = templates.replace(
+        template,
+        scope_name=gast.Str(self._name_for_current_scope()),
+        body=scoped_body)
+    node.body = unscoped_body + scoped_body
     return node
 
 
diff --git a/tensorflow/contrib/autograph/converters/name_scopes_test.py b/tensorflow/contrib/autograph/converters/name_scopes_test.py
index 61e5db2af826d0c2238f1af0f3240411596f7429..17692cbd880dbc1db4bb40ad7345e27907499f9d 100644
--- a/tensorflow/contrib/autograph/converters/name_scopes_test.py
+++ b/tensorflow/contrib/autograph/converters/name_scopes_test.py
@@ -27,9 +27,10 @@ from tensorflow.python.platform import test
 
 class FunctionNameScopeTransformer(converter_test_base.TestCase):
 
-  def test_basic_name(self):
+  def test_basic(self):
 
     def test_fn(l):
+      """This should stay here."""
       a = 5
       l += a
       return l
@@ -38,41 +39,62 @@ class FunctionNameScopeTransformer(converter_test_base.TestCase):
     node = name_scopes.transform(node, self.ctx)
 
     with self.compiled(node, ops.name_scope) as result:
-      result_op = result.test_fn(constant_op.constant([1, 2, 3]))
+      result_op = result.test_fn(constant_op.constant(1))
       self.assertIn('test_fn/', result_op.op.name)
 
-  def test_nested_name(self):
+      self.assertEqual('This should stay here.', result.test_fn.__doc__)
+
+  def test_long_docstring(self):
 
     def test_fn(l):
+      """Multi-line docstring.
+
+      Args:
+        l: A thing.
+      Returns:
+        l
+      """
+      return l
 
-      def body(i):
-        return i**2
+    node = self.parse_and_analyze(test_fn, {})
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      self.assertIn('Multi-line', result.test_fn.__doc__)
+      self.assertIn('Returns:', result.test_fn.__doc__)
 
-      l += [4]
-      return body(l)
+  def test_nested_functions(self):
+
+    def test_fn(l):
+
+      def inner_fn(i):
+        return i ** 2
+
+      l += 4
+      return inner_fn(l)
 
     node = self.parse_and_analyze(test_fn, {})
     node = name_scopes.transform(node, self.ctx)
 
     with self.compiled(node, ops.name_scope) as result:
-      result_op = result.test_fn(constant_op.constant([1, 2, 3]))
+      result_op = result.test_fn(constant_op.constant(1))
       first_result_input_name = result_op.op.inputs[0].name
       second_result_input_name = result_op.op.inputs[1].name
       self.assertIn('test_fn/', first_result_input_name)
-      self.assertNotIn('body/', first_result_input_name)
-      self.assertIn('test_fn/body/', second_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('test_fn/inner_fn/', second_result_input_name)
 
-  def test_class_name(self):
+  def test_method(self):
 
     class TestClass(object):
 
       def test_fn(self, l):
 
-        def body(i):
-          return i**2
+        def inner_fn(i):
+          return i ** 2
 
-        l += [4]
-        return body(l)
+        l += 4
+        return inner_fn(l)
 
     # Note that 'TestClass' was needed in the namespace here.
     node = self.parse_and_analyze(
@@ -80,12 +102,37 @@ class FunctionNameScopeTransformer(converter_test_base.TestCase):
     node = name_scopes.transform(node, self.ctx)
 
     with self.compiled(node, ops.name_scope) as result:
-      result_op = result.TestClass().test_fn(constant_op.constant([1, 2, 3]))
+      result_op = result.TestClass().test_fn(constant_op.constant(1))
       first_result_input_name = result_op.op.inputs[0].name
       second_result_input_name = result_op.op.inputs[1].name
       self.assertIn('TestClass/test_fn/', first_result_input_name)
-      self.assertNotIn('body/', first_result_input_name)
-      self.assertIn('TestClass/test_fn/body/', second_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('TestClass/test_fn/inner_fn/', second_result_input_name)
+
+  def test_operator(self):
+
+    class TestClass(object):
+
+      def __call__(self, l):
+
+        def inner_fn(i):
+          return i ** 2
+
+        l += 4
+        return inner_fn(l)
+
+    # Note that 'TestClass' was needed in the namespace here.
+    node = self.parse_and_analyze(
+        TestClass.__call__, {'TestClass': TestClass}, owner_type=TestClass)
+    node = name_scopes.transform(node, self.ctx)
+
+    with self.compiled(node, ops.name_scope) as result:
+      result_op = result.__call__(TestClass(), constant_op.constant(1))
+      first_result_input_name = result_op.op.inputs[0].name
+      second_result_input_name = result_op.op.inputs[1].name
+      self.assertIn('call__/', first_result_input_name)
+      self.assertNotIn('inner_fn', first_result_input_name)
+      self.assertIn('call__/inner_fn/', second_result_input_name)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..7f5e4d4ac124f3e9834a87193da110160926e77e
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb
@@ -0,0 +1,1421 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "LqNpENf-ec0X",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -U tf-nightly"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "Pa2qpEmoVOGe",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import time\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph\n",
+        "\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import six\n",
+        "\n",
+        "from google.colab import widgets"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "HNqUFL4deCsL",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "# Case study: building an RNN\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "YkC1k4HEQ7rw",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "In this section, we show how you can use AutoGraph to build RNNColorbot, an RNN that takes as input names of colors and predicts their corresponding RGB tuples. The model will be trained by a [custom Estimator](https://www.tensorflow.org/get_started/custom_estimators)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "7nkPDl5CTCNb",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "To get started, set up the dataset. The following cells defines methods that download and format the data needed for RNNColorbot; the details aren't important (read them in the privacy of your own home if you so wish), but make sure to run the cells before proceeding."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "A0uREmVXCQEw",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def parse(line):\n",
+        "  \"\"\"Parses a line from the colors dataset.\"\"\"\n",
+        "  items = tf.string_split([line], \",\").values\n",
+        "  rgb = tf.string_to_number(items[1:], out_type=tf.float32) / 255.0\n",
+        "  color_name = items[0]\n",
+        "  chars = tf.one_hot(tf.decode_raw(color_name, tf.uint8), depth=256)\n",
+        "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
+        "  return rgb, chars, length\n",
+        "\n",
+        "def load_dataset(data_dir, url, batch_size, training=True):\n",
+        "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
+        "  path = tf.keras.utils.get_file(os.path.basename(url), url, cache_dir=data_dir)\n",
+        "  dataset = tf.data.TextLineDataset(path)\n",
+        "  dataset = dataset.skip(1)\n",
+        "  dataset = dataset.map(parse)\n",
+        "  dataset = dataset.cache()\n",
+        "  dataset = dataset.repeat()\n",
+        "  if training:\n",
+        "    dataset = dataset.shuffle(buffer_size=3000)\n",
+        "  dataset = dataset.padded_batch(\n",
+        "      batch_size, padded_shapes=([None], [None, None], []))\n",
+        "  return dataset"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "waZ89t3DTUla",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "To show the use of control flow, we write the RNN loop by hand, rather than using a pre-built RNN model.\n",
+        "\n",
+        "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "9v8AJouiC44V",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "class RnnColorbot(object):\n",
+        "  \"\"\"Holds the parameters of the colorbot model.\"\"\"\n",
+        "\n",
+        "  def __init__(self):\n",
+        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
+        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "    self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
+        "\n",
+        "    self.lower_cell.build(tf.TensorShape((None, 256)))\n",
+        "    self.upper_cell.build(tf.TensorShape((None, 256)))\n",
+        "    self.relu_layer.build(tf.TensorShape((None, 128)))\n",
+        "\n",
+        "\n",
+        "def rnn_layer(chars, cell, batch_size, training):\n",
+        "  \"\"\"A simple RNN layer.\n",
+        "  \n",
+        "  Args:\n",
+        "    chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "    cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "  Returns:\n",
+        "    A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "  \"\"\"\n",
+        "  hidden_outputs = []\n",
+        "  autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "  state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "  for ch in chars:\n",
+        "    cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "    hidden_outputs.append(cell_output)\n",
+        "  hidden_outputs = hidden_outputs.stack()\n",
+        "  if training:\n",
+        "    hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "  return hidden_outputs\n",
+        "\n",
+        "\n",
+        "@autograph.convert(recursive=True)\n",
+        "def model(inputs, colorbot, batch_size, training):\n",
+        "  \"\"\"RNNColorbot model.\n",
+        "  \n",
+        "  The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "  followed by a fully connected layer with ReLU activation.\n",
+        "  \n",
+        "  Args:\n",
+        "    inputs: A tuple (chars, length)\n",
+        "    colorbot: An object of type RnnColorbot\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "    \n",
+        "  Returns:\n",
+        "    A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "  \"\"\"\n",
+        "  (chars, length) = inputs\n",
+        "  seq = tf.transpose(chars, [1, 0, 2])\n",
+        "  seq.set_shape((None, batch_size, 256))\n",
+        "\n",
+        "  seq = rnn_layer(seq, colorbot.lower_cell, batch_size, training)\n",
+        "  seq = rnn_layer(seq, colorbot.upper_cell, batch_size, training)\n",
+        "\n",
+        "  # Grab just the end-of-sequence from each output.\n",
+        "  indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "  sequence_ends = tf.gather_nd(seq, indices)\n",
+        "  return colorbot.relu_layer(sequence_ends)\n",
+        "\n",
+        "@autograph.convert()\n",
+        "def loss_fn(labels, predictions):\n",
+        "  return tf.reduce_mean((predictions - labels) ** 2)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "JjK4gXFvFsf4",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "We will now create the model function for the estimator.\n",
+        "\n",
+        "In the model function, we simply call the converted functions that we defined above - that's it!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "-yso_Nx23Gy1",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def model_fn(features, labels, mode, params):\n",
+        "  \"\"\"Estimator model function.\"\"\"\n",
+        "  chars = features['chars']\n",
+        "  sequence_length = features['sequence_length']\n",
+        "  inputs = (chars, sequence_length)\n",
+        "\n",
+        "  # Create the model components.\n",
+        "  # Simply calling the AutoGraph-ed functions and objects just works!\n",
+        "  colorbot = RnnColorbot()\n",
+        "  \n",
+        "  batch_size = params['batch_size']\n",
+        "\n",
+        "  if mode == tf.estimator.ModeKeys.TRAIN:\n",
+        "    predictions = model(inputs, colorbot, batch_size, training=True)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "\n",
+        "    learning_rate = params['learning_rate']\n",
+        "    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
+        "    global_step = tf.train.get_global_step()\n",
+        "    train_op = optimizer.minimize(loss, global_step=global_step)\n",
+        "    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n",
+        "\n",
+        "  elif mode == tf.estimator.ModeKeys.EVAL:\n",
+        "    predictions = model(inputs, colorbot, batch_size, training=False)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "\n",
+        "    return tf.estimator.EstimatorSpec(mode, loss=loss)\n",
+        "  \n",
+        "  elif mode == tf.estimator.ModeKeys.PREDICT:\n",
+        "    # For prediction, we expect single tensors.\n",
+        "    predictions = model(inputs, colorbot, 1, training=False)\n",
+        "\n",
+        "    predictions = tf.minimum(predictions, 1.0)\n",
+        "    return tf.estimator.EstimatorSpec(mode, predictions=predictions)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "HOQfoBnHC9CP",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "We'll create an input function that will feed our training and eval data."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "FJZlx7yG2MP0",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def input_fn(data_dir, data_url, params, training=True):\n",
+        "  \"\"\"An input function for training\"\"\"\n",
+        "  batch_size = params['batch_size']\n",
+        "  \n",
+        "  # load_dataset defined above\n",
+        "  dataset = load_dataset(data_dir, data_url, batch_size, training=training)\n",
+        "\n",
+        "  # Package the pipeline end in a format suitable for the estimator.\n",
+        "  labels, chars, sequence_length = dataset.make_one_shot_iterator().get_next()\n",
+        "  features = {\n",
+        "      'chars': chars,\n",
+        "      'sequence_length': sequence_length\n",
+        "  }\n",
+        "\n",
+        "  return features, labels"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "qsvv-lzbDqXd",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "source": [
+        "We now have everything in place to build our custom estimator and use it for training and eval!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 35
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 10064,
+          "status": "ok",
+          "timestamp": 1523580419240,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "2pg1AfbxBJQq",
+        "outputId": "41894b16-3d3a-4e30-f6e4-5a9c837a2210",
+        "slideshow": {
+          "slide_type": "-"
+        }
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Eval loss at step 100: 0.0665446\n"
+          ]
+        }
+      ],
+      "source": [
+        "params = {\n",
+        "    'batch_size': 64,\n",
+        "    'learning_rate': 0.01,\n",
+        "}\n",
+        "\n",
+        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
+        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+        "data_dir = \"tmp/rnn/data\"\n",
+        "\n",
+        "regressor = tf.estimator.Estimator(\n",
+        "    model_fn=model_fn,\n",
+        "    params=params)\n",
+        "\n",
+        "regressor.train(\n",
+        "    input_fn=lambda: input_fn(data_dir, train_url, params),\n",
+        "    steps=100)\n",
+        "eval_results = regressor.evaluate(\n",
+        "    input_fn=lambda: input_fn(data_dir, test_url, params, training=False),\n",
+        "    steps=2\n",
+        ")\n",
+        "\n",
+        "print('Eval loss at step %d: %s' % (eval_results['global_step'], eval_results['loss']))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "zG1YAjB_cUnQ",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "source": [
+        "And here's the same estimator used for inference."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 343
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 31286,
+          "status": "ok",
+          "timestamp": 1523580450579,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "dxHex2tUN_10",
+        "outputId": "b3dc558d-b800-4e9b-e60e-3441124e80d8",
+        "slideshow": {
+          "slide_type": "slide"
+        }
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "\u003clink rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'\u003e\u003c/link\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f4112527e90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cscript src='/nbextensions/google.colab/tabbar_main.min.js'\u003e\u003c/script\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f4112527f10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cdiv id=\"id1\"\u003e\u003c/div\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f4112527f50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f474-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"borderColor\": [\"#a7a7a7\"], \"contentBorder\": [\"0px\"], \"tabNames\": [\"RNN Colorbot\"], \"elementId\": \"id1\"});\n",
+              "//# sourceURL=js_a0db480422"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd1d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f475-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_d2a46ea291"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd0d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_0a8262c6e9"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd390\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_e32f85ccd2"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f478-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_eaee748b21"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd550\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"2c60f479-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_2befe06587"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4112527f10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1a-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_8ec4aeeb25"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd690\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_9f9f4574f1"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd350\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_bcccd8f300"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd6d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_2c056cee72"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_c853c3f58b"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd610\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b1f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_e5730ab00d"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2050\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_a897ef7e24"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2250\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_565fa3d154"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4113124d90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b22-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_222e0dc6af"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4113124c10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"354d7b23-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_831db7458f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4113124310\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab4-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_adb576c6eb"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_9418f2d32f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_3fad25f306"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4112527ed0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab7-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_45b9340e7b"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f990c90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab8-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_bec9896d44"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f990a10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fab9-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_460b91ad4a"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3a10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_7dedd0b037"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3890\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_4b1c977dc7"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3bd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fabc-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_d64fedfcf9"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3410\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3803fabd-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_3e8c929c3f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3c50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b986c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_9f9cf2b76f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_b402e6b587"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3d90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_9b7d66db72"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3b10\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b986f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_11ec213a3f"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3950\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9870-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_9c055e4bc0"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41b21d3850\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMRJREFUeJzt3F+IlfW+x/Gvp3FECyIqU4PCO7EgZnQtnUJ0JJGoTDoY\n/dGrMBJhosggIgK7KwwiMdxRF11F/0AJvIisLBqcguxCjEAkmNQGcRvVwIzm71zsc4Yje7P3x9h7\nz97u1+tqrYdnPeu7nos3v2f9m9FaawUQ+K/pHgD49yEYQEwwgJhgADHBAGKCAcQEg2nx9NNPV7fb\nrfvuu69GRkZq5cqV0z0SAcG4xK1evbqGh4ene4wLfPXVVzU8PFyfffZZvf3221VVNWPGjGmeioRg\n8E/122+/1Q8//FDXX399zZo1a7rH4SIJxiXsqaeeqhMnTtSWLVuqv7+/Xn/99frmm2/q/vvvr06n\nU+vXr6+RkZGp/Tdt2lQvv/xyPfDAA9Xf318PP/xwnTlzpqqqJicna9u2bbVs2bLqdDq1YcOGOn36\ndFVVjY2N1ZYtW2rZsmW1du3aeuedd6aOuXPnzhoaGqpt27bV0qVL67333qtnn322Dh06VP39/bVz\n584/m/vo0aO1adOm6nQ6dffdd9f+/furqmp0dLQ6nc7Ufs8880zdeuutU/e3bdtWb7755t/3JHKh\nxiVtcHCwDQ8Pt9ZaO3nyZOt2u+3AgQOttda++OKL1u122+nTp1trrW3cuLGtWbOmff/9921iYqJt\n3Lix7dixo7XW2ltvvdUeffTRNjEx0c6fP98OHz7cfvnll9Zaaw899FDbvn17m5ycbEeOHGnLly+f\nes5XXnml3XTTTe2jjz5qrbU2MTHR3n///fbggw9OzXjw4MG2cuXK1lprZ8+ebWvWrGm7d+9uZ8+e\nbcPDw62vr68dO3Zs6vUcPny4tdba2rVr2+23396OHj3aWmtt1apV7ciRI/+oU0lrzQrjP0D7358L\n7d27t1atWlUrVqyoqqqBgYG6+eab69NPP53a9957760bbrihent764477qgjR45UVVVPT0+dOXOm\njh07VjNmzKjFixfX5ZdfXidPnqyvv/66nnzyyZo5c2YtWrSoNmzYUHv27Jk6Zl9fX61evbqqqnp7\ne//qrIcOHarx8fF65JFHqqenp5YvX16Dg4P1wQcfVFXV0qVLa2RkpE6dOlVVVWvXrq0vv/yyRkdH\n69dff61Fixb9nc4af0nPdA/AP8/x48dr37599fHHH1fVn0Jy7ty5GhgYmNrnmmuumbo9e/bsGh8f\nr6qqe+65p06ePFlPPPFE/fzzz7Vu3bp6/PHHa2xsrK688sqaPXv21OMWLFhQhw8fnro/b968eMax\nsbGaP3/+BdsWLFhQY2NjVVXV6XRq//79dd1111W3261ut1t79uyp3t7eWrJkyUWcDX4PwbjE/f9P\nH+bPn1/r16+v7du3X/Rxenp6auvWrbV169Y6fvx4bd68uRYuXFi33XZb/fTTTzU+Pl5z5sypqqoT\nJ07U3Llz/+IMf8vcuXPrxIkTF2w7fvx4LVy4sKqqut1uvfjiizV//vzqdDrV399fzz33XPX29la3\n273o18XFcUlyibv22mtrdHS0qqrWrVtX+/fvr88//7zOnz9fExMTNTIyUj/++OPfPM7Bgwfru+++\nq/Pnz9ecOXOqp6enLrvsspo3b1719fXVSy+9VJOTk/Xtt9/Wu+++W+vWrftd895yyy01Z86ceu21\n1+rcuXN18ODB+uSTT+rOO++sqqobb7yxZs2aVXv37q1Op1NXXHFFXX311fXhhx9e8IYo/xiCcYnb\nvHlz7dq1q7rdbu3bt6927dpVu3fvroGBgRocHKw33nhj6j2Ov7YSOHXqVA0NDdWSJUvqrrvuqmXL\nlk1FYceOHTU6OlorVqyooaGheuyxxy64zLkYM2fOrFdffbUOHDhQy5cvr+eff75eeOGFqRVG1Z9W\nGVddddXUpc7/hWLx4sW/6znJzWjNH+gAGSsMICYYQEwwgJhgALF/2e9h/PEP/z3dI8B/tKseee/P\ntllhADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwg\nJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICY\nYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKC\nAcQEA4gJBhATDCA2o7XWpnsI4N+DFQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE/gfh60wGjfc7LQAAAABJRU5ErkJg\ngg==\n",
+            "text/plain": [
+              "\u003cmatplotlib.figure.Figure at 0x7f4113124310\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9871-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_ba6a061307"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd890\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_83e3496927"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_f437bab20d"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a22d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9874-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_93aa63450e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2b90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9875-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_aca189bea5"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd4d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "text/html": [
+              "\u003cdiv class=id_100313201 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f410f990a90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n",
+              "//# sourceURL=js_5df1fe383e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3b9b9877-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_c62c7174ad"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2390\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 input\");\n",
+              "//# sourceURL=js_2e2201ddc4"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2810\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76585-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"].remove();\n",
+              "//# sourceURL=js_288e5283d6"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a26d0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n",
+              "//# sourceURL=js_2f31d19cde"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76587-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n",
+              "//# sourceURL=js_2fbbcda050"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f4112527e90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          },
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": [
+              "window[\"3ed76588-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n",
+              "//# sourceURL=js_f94d975cf3"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          },
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "def predict_input_fn(color_name):\n",
+        "  \"\"\"An input function for prediction.\"\"\"\n",
+        "  _, chars, sequence_length = parse(color_name)\n",
+        "  \n",
+        "  # We create a batch of a single element.\n",
+        "  features = {\n",
+        "      'chars': tf.expand_dims(chars, 0),\n",
+        "      'sequence_length': tf.expand_dims(sequence_length, 0)\n",
+        "  }\n",
+        "  return features, None\n",
+        "\n",
+        "\n",
+        "def draw_prediction(color_name, pred):\n",
+        "  pred = pred * 255\n",
+        "  pred = pred.astype(np.uint8)\n",
+        "  plt.axis('off')\n",
+        "  plt.imshow(pred)\n",
+        "  plt.title(color_name)\n",
+        "  plt.show()\n",
+        "\n",
+        "\n",
+        "def predict_with_estimator(color_name, regressor):\n",
+        "  predictions = regressor.predict(\n",
+        "      input_fn=lambda:predict_input_fn(color_name))\n",
+        "  pred = next(predictions)\n",
+        "  predictions.close()\n",
+        "  pred = np.minimum(pred, 1.0)\n",
+        "  pred = np.expand_dims(np.expand_dims(pred, 0), 0)\n",
+        "\n",
+        "  draw_prediction(color_name, pred)\n",
+        "\n",
+        "tb = widgets.TabBar([\"RNN Colorbot\"])\n",
+        "while True:\n",
+        "  with tb.output_to(0):\n",
+        "    try:\n",
+        "      color_name = six.moves.input(\"Give me a color name (or press 'enter' to exit): \")\n",
+        "    except (EOFError, KeyboardInterrupt):\n",
+        "      break\n",
+        "  if not color_name:\n",
+        "    break\n",
+        "  with tb.output_to(0):\n",
+        "    tb.clear_tab()\n",
+        "    predict_with_estimator(color_name, regressor)\n",
+        "  "
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "default_view": {},
+      "name": "RNN Colorbot using Estimators",
+      "provenance": [
+        {
+          "file_id": "1CtzefX39ffFibX_BqE6cRbT0UW_DdVKl",
+          "timestamp": 1523579810961
+        },
+        {
+          "file_id": "1DcfimonWU11tmyivKBGVrbpAl3BIOaRG",
+          "timestamp": 1523016192637
+        },
+        {
+          "file_id": "1wCZUh73zTNs1jzzYjqoxMIdaBWCdKJ2K",
+          "timestamp": 1522238054357
+        },
+        {
+          "file_id": "1_HpC-RrmIv4lNaqeoslUeWaX8zH5IXaJ",
+          "timestamp": 1521743157199
+        },
+        {
+          "file_id": "1mjO2fQ2F9hxpAzw2mnrrUkcgfb7xSGW-",
+          "timestamp": 1520522344607
+        }
+      ],
+      "version": "0.3.2",
+      "views": {}
+    },
+    "kernelspec": {
+      "display_name": "Python 2",
+      "name": "python2"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index dce994e50df60d8bd419f62207d77035beac9f5a..3c3130c77025c45ca219daf4bb66082f4e8a7f82 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -49,7 +49,7 @@ def convert(recursive=False, verbose=False, arg_types=None):
   function is called. This means the parameter values are known at compilation.
 
   Args:
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     verbose: Whether to output the compiled code in the logs.
     arg_types: See to_graph.
@@ -137,7 +137,7 @@ def converted_call(f, recursive, verbose, arg_types, *args, **kwargs):
 
   unknown_arg_value = object()  # Sentinel for arguments of unknown value
 
-  if tf_inspect.isbuiltin(f):
+  if inspect_utils.isbuiltin(f):
     return builtins.dynamic_builtin(f, *args, **kwargs)
 
   if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
@@ -156,7 +156,7 @@ def converted_call(f, recursive, verbose, arg_types, *args, **kwargs):
     # Constructors
     target_entity = f
     arg_map_target = f.__init__
-    effective_args = (unknown_arg_value,) + args
+    effective_args = args
     partial_types = ()
 
   elif hasattr(f, '__call__') and hasattr(f, '__class__'):
@@ -215,7 +215,7 @@ def to_graph(e,
 
   Args:
     e: A Python entity.
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     verbose: Whether to output the compiled code in the logs.
     arg_values: A dict containing value hints for symbols like function
@@ -235,7 +235,8 @@ def to_graph(e,
       nocompile_decorators=(convert, do_not_convert, converted_call),
       partial_types=partial_types,
       api_module=tf_inspect.getmodule(to_graph))
-  _, name = conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
+  _, name, namespace = conversion.entity_to_graph(e, conversion_map, arg_values,
+                                                  arg_types)
 
   module = gast.Module([])
   for import_line in config.COMPILED_IMPORT_STATEMENTS:
@@ -244,13 +245,12 @@ def to_graph(e,
     module.body.append(dep)
   compiled_node, compiled_src = compiler.ast_to_object(module)
 
-  # The compiled code should see everything the entry function saw.
+  # The compiled code should see everything the entry entity saw.
   # TODO(mdan): This might not work well if the call tree spans modules?
-  if tf_inspect.isfunction(e):
-    for key, val in inspect_utils.getnamespace(e).items():
-      # Avoid overwriting entities that have been transformed.
-      if key not in compiled_node.__dict__:
-        compiled_node.__dict__[key] = val
+  for key, val in namespace.items():
+    # Avoid overwriting entities that have been transformed.
+    if key not in compiled_node.__dict__:
+      compiled_node.__dict__[key] = val
   compiled_fn = getattr(compiled_node, name)
 
   if verbose:
diff --git a/tensorflow/contrib/autograph/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
index f9db07778a33498f699923a9e0a193c843bfefd8..a7737b7f448131b1c54951efa719b481e1f4d0c9 100644
--- a/tensorflow/contrib/autograph/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -39,8 +39,6 @@ class ApiTest(test.TestCase):
         'from __future__ import print_function',
         'from tensorflow.contrib.autograph import utils'
         ' as autograph_utils',
-        'from tensorflow.contrib.autograph import operators'
-        ' as __ops',
         'tf = autograph_utils.fake_tf()',
     )
 
@@ -179,6 +177,92 @@ class ApiTest(test.TestCase):
           constant_op.constant(-2))
       self.assertListEqual([0, 1], sess.run(x).tolist())
 
+  def test_converted_call_builtin(self):
+    x = api.converted_call(range, False, False, {}, 3)
+    self.assertEqual((0, 1, 2), tuple(x))
+
+  def test_converted_call_function(self):
+
+    def test_fn(x):
+      if x < 0:
+        return -x
+      return x
+
+    with self.test_session() as sess:
+      x = api.converted_call(
+          test_fn, False, False, {}, constant_op.constant(-1))
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_method(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(tc.test_method, False, False, {}, tc)
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_method_by_class(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(TestClass.test_method, False, False, {}, tc)
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_callable_object(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def __call__(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = TestClass(constant_op.constant(-1))
+      x = api.converted_call(tc, False, False, {})
+      self.assertEqual(1, sess.run(x))
+
+  def test_converted_call_constructor(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def test_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+    with self.test_session() as sess:
+      tc = api.converted_call(
+          TestClass, False, False, {}, constant_op.constant(-1))
+      # tc is now a converted object.
+      x = tc.test_method()
+      self.assertEqual(1, sess.run(x))
+
   def test_to_graph_basic(self):
 
     def test_fn(x, s):
diff --git a/tensorflow/contrib/autograph/impl/config.py b/tensorflow/contrib/autograph/impl/config.py
index 26326465e265f5b40c3badedc0ea2813248ef60f..2600088595a12761b1138c4649c06882bd8fd000 100644
--- a/tensorflow/contrib/autograph/impl/config.py
+++ b/tensorflow/contrib/autograph/impl/config.py
@@ -46,10 +46,4 @@ NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 COMPILED_IMPORT_STATEMENTS = (
     'from __future__ import print_function',
     'import tensorflow as tf',
-    'from tensorflow.contrib.autograph.impl import api'
-    ' as autograph_api',
-    'from tensorflow.contrib.autograph import utils'
-    ' as autograph_utils',
-    'from tensorflow.contrib.autograph import operators'
-    ' as __ops',
 )
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 3bacc9430098d9cebf1726074524731899cdd965..bcf31b8961ece1cb0ddf9157456db15deb164b2e 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import gast
 
+from tensorflow.contrib.autograph import operators
 from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.converters import asserts
 from tensorflow.contrib.autograph.converters import break_statements
@@ -56,7 +57,7 @@ class ConversionMap(object):
   This object is mutable, and is updated as functions are converted.
 
   Attributes:
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     nocompile_decorators: tuple of decorator functions that toggle compilation
         off.
@@ -138,20 +139,22 @@ def entity_to_graph(o, conversion_map, arg_values, arg_types):
         parameters.
 
   Returns:
-    A tuple (ast, new_name):
+    A tuple (ast, new_name, namespace):
         * ast: An AST representing an entity with interface equivalent to `o`,
             but which when executed it creates TF a graph.
         * new_name: The symbol name under which the new entity can be found.
+        * namespace: A dict mapping all symbols visible to the converted entity,
+            keyed by their symbol name.
 
   Raises:
     ValueError: if the entity type is not supported.
   """
   if tf_inspect.isclass(o):
-    node, new_name = class_to_graph(o, conversion_map)
+    node, name, ns = class_to_graph(o, conversion_map)
   elif tf_inspect.isfunction(o):
-    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
+    node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
   elif tf_inspect.ismethod(o):
-    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
+    node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types)
   else:
     raise ValueError(
         'Entity "%s" has unsupported type "%s". Only functions and classes are '
@@ -174,7 +177,7 @@ def entity_to_graph(o, conversion_map, arg_values, arg_types):
         continue
       entity_to_graph(candidate, conversion_map, {}, {})
 
-  return node, new_name
+  return node, name, ns
 
 
 def class_to_graph(c, conversion_map):
@@ -185,17 +188,18 @@ def class_to_graph(c, conversion_map):
   if not members:
     raise ValueError('Cannot convert %s: it has no member methods.' % c)
 
-  class_namespace = None
+  class_namespace = {}
   for _, m in members:
-    node, _ = function_to_graph(
+    node, _, namespace = function_to_graph(
         m,
         conversion_map=conversion_map,
         arg_values={},
         arg_types={'self': (c.__name__, c)},
         owner_type=c)
-    # TODO(mdan): Do not assume all members have the same view of globals.
     if class_namespace is None:
-      class_namespace = inspect_utils.getnamespace(m)
+      class_namespace = namespace
+    else:
+      class_namespace.update(namespace)
     converted_members[m] = node
   namer = conversion_map.new_namer(class_namespace)
   class_name = namer.compiled_class_name(c.__name__, c)
@@ -206,25 +210,23 @@ def class_to_graph(c, conversion_map):
       body=list(converted_members.values()),
       decorator_list=[])
 
-  return node, class_name
+  return node, class_name, class_namespace
+
+
+def _add_reserved_symbol(namespace, name, entity):
+  if name not in namespace:
+    namespace[name] = entity
+  elif namespace[name] != entity:
+    raise ValueError('The name "%s" is reserved and may not be used.' % name)
 
 
 def _add_self_references(namespace, api_module):
-  """Self refs are only required for analysis and are not used directly."""
   # Manually add the utils namespace which may be used from generated code.
-  if 'autograph_util' not in namespace:
-    namespace['autograph_utils'] = utils
-  elif namespace['autograph_utils'] != utils:
-    raise ValueError(
-        'The module name "autograph_utils" is reserved and may not be used.')
-
+  _add_reserved_symbol(namespace, 'autograph_utils', utils)
+  _add_reserved_symbol(namespace, '__ops', operators)
   # We also make reference to the api module for dynamic conversion, but
   # to avoid circular references we don't import it here.
-  if 'autograph_api' not in namespace:
-    namespace['autograph_api'] = api_module
-  elif namespace['autograph_api'] != api_module:
-    raise ValueError(
-        'The module name "autograph_api" is reserved and may not be used.')
+  _add_reserved_symbol(namespace, 'autograph_api', api_module)
 
 
 def function_to_graph(f, conversion_map, arg_values, arg_types,
@@ -261,7 +263,7 @@ def function_to_graph(f, conversion_map, arg_values, arg_types,
   # TODO(mdan): Use this at compilation.
   conversion_map.additional_imports.update(deps)
 
-  return node, new_name
+  return node, new_name, namespace
 
 
 def _static_analysis_pass(node, ctx):
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index 7066739eb87f89ab98e906b10dab62baeaa2de8e..962009c71f51a30d77ed886ca6f4b315b3f265f5 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -43,14 +43,15 @@ class ConversionTest(test.TestCase):
       conversion.entity_to_graph('dummy', conversion_map, None, None)
 
   def test_entity_to_graph_callable(self):
-
+    b = 2
     def f(a):
-      return a
+      return a + b
 
     conversion_map = conversion.ConversionMap(True, (), (), None)
-    ast, new_name = conversion.entity_to_graph(f, conversion_map, None, None)
+    ast, name, ns = conversion.entity_to_graph(f, conversion_map, None, None)
     self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
-    self.assertEqual('tf__f', new_name)
+    self.assertEqual('tf__f', name)
+    self.assertTrue(ns['b'] is b)
 
   def test_entity_to_graph_call_tree(self):
 
diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
index 5b8cb2d63c7e2987431104c7997ff612f17b8e7e..81ae64f110924cb9f8f089ced2f44bb8e3aa5135 100644
--- a/tensorflow/contrib/autograph/operators/control_flow.py
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -83,7 +83,8 @@ def _known_len_for_loop(iterated, extra_cond, loop_body, init_state):
       while_cond,
       while_body,
       init_state=(0,) + init_state,
-      extra_deps=(iterated,))
+      extra_deps=(iterated,),
+      opts=dict(maximum_iterations=n))
   # Dropping the iteration index because it's not syntactically visible.
   results = results[1:]
 
@@ -136,7 +137,7 @@ def _dataset_for_loop(ds, extra_cond, loop_body, init_state):
   return results
 
 
-def while_loop(loop_cond, loop_body, init_state, extra_deps):
+def while_loop(loop_cond, loop_body, init_state, extra_deps, opts=None):
   """Functional form of a while statement.
 
   The loop operates on a so-called state, which includes all symbols that are
@@ -153,6 +154,7 @@ def while_loop(loop_cond, loop_body, init_state, extra_deps):
     extra_deps: Tuple containing additional entities on which the loop may
         depend, such as loop invariants referenced by loop_cond. Used
         exclusively for dispatch control.
+    opts: Optional dict of extra loop parameters.
 
   Returns:
     Tuple containing the final state.
@@ -161,18 +163,21 @@ def while_loop(loop_cond, loop_body, init_state, extra_deps):
   # That could be somethins as simple as a collection of dispatch rules, with
   # some prioritization.
   if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
-    return _tf_while_loop(loop_cond, loop_body, init_state)
+    return _tf_while_loop(loop_cond, loop_body, init_state, opts)
   else:
-    return _py_while_loop(loop_cond, loop_body, init_state)
+    return _py_while_loop(loop_cond, loop_body, init_state, opts)
 
 
-def _tf_while_loop(loop_cond, loop_body, init_state):
+def _tf_while_loop(loop_cond, loop_body, init_state, opts):
   """Overload of while_loop that stages a TF while_loop."""
-  return control_flow_ops.while_loop(loop_cond, loop_body, init_state)
+  if opts is None:
+    opts = {}
+  return control_flow_ops.while_loop(loop_cond, loop_body, init_state, **opts)
 
 
-def _py_while_loop(loop_cond, loop_body, init_state):
+def _py_while_loop(loop_cond, loop_body, init_state, opts):
   """Overload of while_loop that executes a Python while loop."""
+  del opts
   state = init_state
   while loop_cond(*state):
     state = loop_body(*state)
diff --git a/tensorflow/contrib/autograph/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
index c483ff68c4b7c6d9a3315f569b62b8f253079f00..796ab445c74128e1123e24b67c288e0e3c5ca24c 100644
--- a/tensorflow/contrib/autograph/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -125,3 +125,14 @@ py_test(
         "@gast_archive//:gast",
     ],
 )
+
+py_test(
+    name = "transformer_test",
+    srcs = ["transformer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index 30a5961821ac16922b843552acdc9c15d04db8aa..63361cc4f2557d22800072d90a51b7e4ddab34ab 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -22,12 +22,25 @@ from __future__ import division
 from __future__ import print_function
 
 import itertools
+import types
 
 import six
 
 from tensorflow.python.util import tf_inspect
 
 
+def isbuiltin(f):
+  # Note these return false for isinstance(f, types.BuiltinFunctionType) so we
+  # need to specifically check for them.
+  if f in (range, int, float):
+    return True
+  if isinstance(f, types.BuiltinFunctionType):
+    return True
+  if tf_inspect.isbuiltin(f):
+    return True
+  return False
+
+
 def getnamespace(f):
   """Returns the complete namespace of a function.
 
@@ -50,6 +63,18 @@ def getnamespace(f):
   return namespace
 
 
+def getdefiningclass(m, owner_class):
+  """Resolves the class (e.g. one of the superclasses) that defined a method."""
+  m = six.get_unbound_function(m)
+  last_defining = owner_class
+  for superclass in tf_inspect.getmro(owner_class):
+    if hasattr(superclass, m.__name__):
+      superclass_m = getattr(superclass, m.__name__)
+      if six.get_unbound_function(superclass_m) == m:
+        last_defining = superclass
+  return last_defining
+
+
 def getmethodclass(m):
   """Resolves a function's owner, e.g. a method's class.
 
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index eda3fc13fdca7f61b6172fceddd9f90fa087ad37..cf841dae814f64583bc43a2e110f1dcf5c0d7c1f 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -234,6 +234,37 @@ class InspectUtilsTest(test.TestCase):
     c = TestCallable()
     self.assertEqual(inspect_utils.getmethodclass(c), TestCallable)
 
+  def test_getdefiningclass(self):
+    class Superclass(object):
+
+      def foo(self):
+        pass
+
+      def bar(self):
+        pass
+
+    class Subclass(Superclass):
+
+      def foo(self):
+        pass
+
+      def baz(self):
+        pass
+
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.foo, Subclass) is Subclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.bar, Subclass) is Superclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass)
+
+  def test_isbuiltin(self):
+    self.assertTrue(inspect_utils.isbuiltin(range))
+    self.assertTrue(inspect_utils.isbuiltin(float))
+    self.assertTrue(inspect_utils.isbuiltin(int))
+    self.assertTrue(inspect_utils.isbuiltin(len))
+    self.assertFalse(inspect_utils.isbuiltin(function_decorator))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index 6dd53091fa3a4d874e9133520a2299b8a93ba231..b6817e9d75baa47d43a62cc5d8357bc319583312 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -162,11 +162,11 @@ class Scope(object):
       self.parent.mark_returned(name)
 
 
-class ActivityAnalizer(transformer.Base):
+class ActivityAnalyzer(transformer.Base):
   """Annotates nodes with local scope information. See Scope."""
 
   def __init__(self, context, parent_scope):
-    super(ActivityAnalizer, self).__init__(context)
+    super(ActivityAnalyzer, self).__init__(context)
     self.scope = Scope(parent_scope)
     self._in_return_statement = False
 
@@ -323,4 +323,4 @@ class ActivityAnalizer(transformer.Base):
 
 
 def resolve(node, context, parent_scope=None):
-  return ActivityAnalizer(context, parent_scope).visit(node)
+  return ActivityAnalyzer(context, parent_scope).visit(node)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index 1e6c686b01445a86499d4f5254ea7e139e450843..65e1a8f0ea2e75a94811b2d9d3bb177f3531580e 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -108,7 +108,7 @@ class ScopeTest(test.TestCase):
     self.assertFalse(QN('a') in child.referenced)
 
 
-class ActivityAnalizerTest(test.TestCase):
+class ActivityAnalyzerTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn):
     node, source = parser.parse_entity(test_fn)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
index d6d9f7e1a6028d1ce8eee6c3e250a260c3bf827f..b929b35b79200b0968c9c4f26b10cda28763773a 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Annotations used by the static analizer."""
+"""Annotations used by the static analyzer."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,15 +28,15 @@ class NoValue(Enum):
 
 
 class NodeAnno(NoValue):
-  """Additionnal annotations used by the static analyzer.
+  """Additional annotations used by the static analyzer.
 
   These are in addition to the basic annotations declared in anno.py.
   """
 
   # Symbols
   # These flags are boolean.
-  IS_LOCAL = 'Symbol is local to the function scope being analized.'
-  IS_PARAM = 'Symbol is a parameter to the function being analized.'
+  IS_LOCAL = 'Symbol is local to the function scope being analyzed.'
+  IS_PARAM = 'Symbol is a parameter to the function being analyzed.'
   IS_MODIFIED_SINCE_ENTRY = (
       'Symbol has been explicitly replaced in the current function scope.')
 
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index 35f114b6e11901a854c1d631061ae42285c0e261..b38d52c5b2c71ce70172b63160a81ec06d073dcd 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -51,6 +51,11 @@ class Base(gast.NodeTransformer):
     self._lineno = 0
     self._col_offset = 0
     self.context = context
+    self._enclosing_entities = []
+
+  @property
+  def enclosing_entities(self):
+    return tuple(self._enclosing_entities)
 
   def debug_print(self, node):
     """Helper method useful for debugging."""
@@ -61,13 +66,20 @@ class Base(gast.NodeTransformer):
   def visit(self, node):
     source_code = self.context.source_code
     source_file = self.context.source_file
+    did_enter_function = False
+
     try:
+      if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
+        self._enclosing_entities.append(node)
+        did_enter_function = True
+
       if source_code and hasattr(node, 'lineno'):
         self._lineno = node.lineno
         self._col_offset = node.col_offset
       if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
         return node
       return super(Base, self).visit(node)
+
     except (ValueError, AttributeError, KeyError, NotImplementedError,
             AssertionError) as e:
       msg = '%s: %s\nOffending source:\n%s\n\nOccurred at node:\n%s' % (
@@ -82,3 +94,6 @@ class Base(gast.NodeTransformer):
                       msg,
                       (source_file, self._lineno, self._col_offset + 1, line)),
                   sys.exc_info()[2])
+    finally:
+      if did_enter_function:
+        self._enclosing_entities.pop()
diff --git a/tensorflow/contrib/autograph/pyct/transformer_test.py b/tensorflow/contrib/autograph/pyct/transformer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f1c31ef65690806894b1550954f93e1a2a129a
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/transformer_test.py
@@ -0,0 +1,97 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for templates module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import context
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.contrib.autograph.pyct import transformer
+from tensorflow.python.platform import test
+
+
+class TransformerTest(test.TestCase):
+
+  def test_entity_scope_tracking(self):
+
+    class TestTransformer(transformer.Base):
+
+      # The choice of note to assign to is arbitrary. Using Assign because it's
+      # easy to find in the tree.
+      def visit_Assign(self, node):
+        anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
+        return self.generic_visit(node)
+
+      # This will show up in the lambda function.
+      def visit_BinOp(self, node):
+        anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
+        return self.generic_visit(node)
+
+    tr = TestTransformer(
+        context.EntityContext(
+            namer=None,
+            source_code=None,
+            source_file=None,
+            namespace=None,
+            arg_values=None,
+            arg_types=None,
+            owner_type=None,
+            recursive=False))
+
+    def test_function():
+      a = 0
+
+      class TestClass(object):
+
+        def test_method(self):
+          b = 0
+          def inner_function(x):
+            c = 0
+            d = lambda y: (x + y)
+            return c, d
+          return b, inner_function
+      return a, TestClass
+
+    node, _ = parser.parse_entity(test_function)
+    node = tr.visit(node)
+
+    test_function_node = node.body[0]
+    test_class = test_function_node.body[1]
+    test_method = test_class.body[0]
+    inner_function = test_method.body[1]
+    lambda_node = inner_function.body[1].value
+
+    a = test_function_node.body[0]
+    b = test_method.body[0]
+    c = inner_function.body[0]
+    lambda_expr = lambda_node.body
+
+    self.assertEqual(
+        (test_function_node,), anno.getanno(a, 'enclosing_entities'))
+    self.assertEqual((test_function_node, test_class, test_method),
+                     anno.getanno(b, 'enclosing_entities'))
+    self.assertEqual(
+        (test_function_node, test_class, test_method, inner_function),
+        anno.getanno(c, 'enclosing_entities'))
+    self.assertEqual((test_function_node, test_class, test_method,
+                      inner_function, lambda_node),
+                     anno.getanno(lambda_expr, 'enclosing_entities'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index 7fbb7c09d81ff3d3916d48d3c48e377b90a25907..0a0e72d70e973bfbdaaa88c6dc857ec5784b95e4 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -84,7 +84,7 @@ def is_tf_print_compatible(value):
 
 
 def dynamic_print(*values):
-  """Implementartion of print using dynamic dispatch.
+  """Implementation of print using dynamic dispatch.
 
   The function attempts to use tf.Print if all the values are compatible.
   Otherwise, it will fall back to py_func.
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 17e20c4b315bab8852c90788567a2f2f92119f40..8cff1a3bb1d11aff6a264636291a7149b40de516 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -28,12 +28,13 @@ py_library(
     srcs = ["model.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":trainer_hooks",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees:model_ops_py",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
+        "//tensorflow/python:training_util",
     ],
 )
 
@@ -51,6 +52,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "estimator_utils",
+    srcs = ["estimator_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/learn",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
 py_test(
     name = "trainer_hooks_test",
     size = "small",
@@ -118,6 +131,7 @@ py_library(
     srcs = ["estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":model",
         "//tensorflow/contrib/boosted_trees:losses",
         "//tensorflow/contrib/learn",
@@ -130,6 +144,7 @@ py_library(
     srcs = ["dnn_tree_combined_estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":estimator_utils",
         ":trainer_hooks",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees:model_ops_py",
@@ -159,3 +174,22 @@ py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
 )
+
+py_test(
+    name = "estimator_test",
+    size = "medium",
+    srcs = ["estimator_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":estimator",
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index d9b0d89a03dce40d34f76bb1262d26bb587a2dc7..62f1f4122b05b56a708823df4246d618bd3fa5d4 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -39,7 +39,8 @@ _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE = "%s_%d"
 def make_custom_export_strategy(name,
                                 convert_fn,
                                 feature_columns,
-                                export_input_fn):
+                                export_input_fn,
+                                use_core_columns=False):
   """Makes custom exporter of GTFlow tree format.
 
   Args:
@@ -58,7 +59,7 @@ def make_custom_export_strategy(name,
   input_fn = export_input_fn()
   (sorted_feature_names, dense_floats, sparse_float_indices, _, _,
    sparse_int_indices, _, _) = gbdt_batch.extract_features(
-       input_fn.features, feature_columns)
+       input_fn.features, feature_columns, use_core_columns)
 
   def export_fn(estimator, export_dir, checkpoint_path=None, eval_result=None):
     """A wrapper to export to SavedModel, and convert it to other formats."""
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index 2e7b8cba05b89feaac3f47e13d26e7ae37a7b0ae..9994c84ebdb930eea0818188225488eb5eca84eb 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -19,25 +19,20 @@ logits of the DNN. The input layer of the DNN (including the embeddings learned
 over sparse features) can optionally be provided to the boosted trees as
 an additional input feature.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import six
 from tensorflow.contrib import layers
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
 from tensorflow.contrib.layers.python.layers import optimizers
-from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
-from tensorflow.contrib.learn.python.learn.estimators import model_fn as contrib_model_fn_lib
-from tensorflow.contrib.learn.python.learn.estimators import prediction_key
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export_output
 from tensorflow.python.feature_column import feature_column as feature_column_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
@@ -48,56 +43,8 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
 
-
 _DNN_LEARNING_RATE = 0.001
 
-_CORE_MODE_TO_CONTRIB_MODE_ = {
-    model_fn_lib.ModeKeys.TRAIN: contrib_model_fn_lib.ModeKeys.TRAIN,
-    model_fn_lib.ModeKeys.EVAL: contrib_model_fn_lib.ModeKeys.EVAL,
-    model_fn_lib.ModeKeys.PREDICT: contrib_model_fn_lib.ModeKeys.INFER
-}
-
-
-def _core_mode_to_contrib_mode(mode):
-  return _CORE_MODE_TO_CONTRIB_MODE_[mode]
-
-
-def _export_outputs_to_output_alternatives(export_outputs):
-  """Converts EstimatorSpec.export_outputs to output_alternatives.
-
-  Args:
-    export_outputs: export_outputs created by create_estimator_spec.
-  Returns:
-    converted output_alternatives.
-  """
-  output = dict()
-  if export_outputs is not None:
-    for key, value in export_outputs.items():
-      if isinstance(value, export_output.ClassificationOutput):
-        exported_predictions = {
-            prediction_key.PredictionKey.SCORES: value.scores,
-            prediction_key.PredictionKey.CLASSES: value.classes
-        }
-        output[key] = (constants.ProblemType.CLASSIFICATION,
-                       exported_predictions)
-    return output
-  return None
-
-
-def _estimator_spec_to_model_fn_ops(estimator_spec, is_regression):
-  alternatives = []
-  if not is_regression:
-    _export_outputs_to_output_alternatives(estimator_spec.export_outputs)
-
-  return model_fn.ModelFnOps(
-      mode=_core_mode_to_contrib_mode(estimator_spec.mode),
-      predictions=estimator_spec.predictions,
-      loss=estimator_spec.loss,
-      train_op=estimator_spec.train_op,
-      eval_metric_ops=estimator_spec.eval_metric_ops,
-      output_alternatives=alternatives)
-
-
 def _get_optimizer(optimizer):
   if callable(optimizer):
     return optimizer()
@@ -128,8 +75,7 @@ def _dnn_tree_combined_model_fn(features,
                                 dnn_steps_to_train=10000,
                                 tree_feature_columns=None,
                                 tree_center_bias=False,
-                                use_core_versions=False,
-                                is_regression=False):
+                                use_core_versions=False):
   """DNN and GBDT combined model_fn.
 
   Args:
@@ -169,7 +115,6 @@ def _dnn_tree_combined_model_fn(features,
       first fitting the bias.
     use_core_versions: Whether feature columns and loss are from the core (as
       opposed to contrib) version of tensorflow.
-    is_regression: Whether the problem is regression or not.
 
   Returns:
     A `ModelFnOps` object.
@@ -305,8 +250,8 @@ def _dnn_tree_combined_model_fn(features,
         labels=labels,
         train_op_fn=_dnn_train_op_fn,
         logits=dnn_logits)
-    dnn_train_op = _estimator_spec_to_model_fn_ops(dnn_train_op,
-                                                   is_regression).train_op
+    dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+        dnn_train_op).train_op
 
     tree_train_op = head.create_estimator_spec(
         features=tree_features,
@@ -314,10 +259,10 @@ def _dnn_tree_combined_model_fn(features,
         labels=labels,
         train_op_fn=_tree_train_op_fn,
         logits=tree_train_logits)
-    tree_train_op = _estimator_spec_to_model_fn_ops(tree_train_op,
-                                                    is_regression).train_op
+    tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+        tree_train_op).train_op
 
-    model_fn_ops = _estimator_spec_to_model_fn_ops(model_fn_ops, is_regression)
+    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
   else:
     model_fn_ops = head.create_model_fn_ops(
         features=features,
@@ -529,26 +474,12 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
 
     def _model_fn(features, labels, mode, config):
       return _dnn_tree_combined_model_fn(
-          features,
-          labels,
-          mode,
-          head,
-          dnn_hidden_units,
-          dnn_feature_columns,
-          tree_learner_config,
-          num_trees,
-          tree_examples_per_layer,
-          config,
-          dnn_optimizer,
-          dnn_activation_fn,
-          dnn_dropout,
-          dnn_input_layer_partitioner,
-          dnn_input_layer_to_tree,
-          dnn_steps_to_train,
-          tree_feature_columns,
-          tree_center_bias,
-          use_core_versions,
-          is_regression=True)
+          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
+          tree_learner_config, num_trees, tree_examples_per_layer, config,
+          dnn_optimizer, dnn_activation_fn, dnn_dropout,
+          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias,
+          use_core_versions)
 
     super(DNNBoostedTreeCombinedRegressor, self).__init__(
         model_fn=_model_fn, model_dir=model_dir,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 70454aa6dbdb19297028a3f80822719bef5a0f72..89d0d611d2905492cec09e033b8cbc238ec7fac6 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -40,7 +40,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
                label_keys=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeClassifier estimator instance.
 
     Args:
@@ -63,7 +64,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
-
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     Raises:
       ValueError: If learner_config is not valid.
     """
@@ -99,6 +101,7 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'center_bias': center_bias,
             'logits_modifier_function': logits_modifier_function,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
@@ -120,7 +123,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
                config=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeRegressor estimator instance.
 
     Args:
@@ -145,6 +149,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     head = head_lib.regression_head(
         label_name=label_name,
@@ -166,6 +172,7 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'logits_modifier_function': logits_modifier_function,
             'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
@@ -189,7 +196,8 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
                config=None,
                feature_engineering_fn=None,
                logits_modifier_function=None,
-               center_bias=True):
+               center_bias=True,
+               use_core_libs=False):
     """Initializes a GradientBoostedDecisionTreeEstimator estimator instance.
 
     Args:
@@ -210,6 +218,8 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
       logits_modifier_function: A modifier function for the logits.
       center_bias: Whether a separate tree should be created for first fitting
         the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
     """
     super(GradientBoostedDecisionTreeEstimator, self).__init__(
         model_fn=model.model_builder,
@@ -222,6 +232,7 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
             'examples_per_layer': examples_per_layer,
             'logits_modifier_function': logits_modifier_function,
             'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
         },
         model_dir=model_dir,
         config=config,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d58317bd59331cfcde0e12aeb3a3a03fc45d89b
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for GBDT estimator."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column
+from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import googletest
+
+
+def _train_input_fn():
+  features = {"x": constant_op.constant([[2.], [1.], [1.]])}
+  label = constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+  return features, label
+
+
+def _eval_input_fn():
+  features = {"x": constant_op.constant([[1.], [2.], [2.]])}
+  label = constant_op.constant([[0], [1], [1]], dtype=dtypes.int32)
+  return features, label
+
+
+class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._export_dir_base = tempfile.mkdtemp() + "export/"
+    gfile.MkDir(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowException(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.GradientBoostedDecisionTreeClassifier(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[contrib_feature_column.real_valued_column("x")])
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+    classifier.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForEstimator(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    # Use core head
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+
+    model = estimator.GradientBoostedDecisionTreeEstimator(
+        head=head_fn,
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    model.fit(input_fn=_train_input_fn, steps=15)
+    model.evaluate(input_fn=_eval_input_fn, steps=1)
+    model.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForClassifier(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.GradientBoostedDecisionTreeClassifier(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+    classifier.export(self._export_dir_base)
+
+  def testFitAndEvaluateDontThrowExceptionWithCoreForRegressor(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    regressor = estimator.GradientBoostedDecisionTreeRegressor(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")],
+        use_core_libs=True)
+
+    regressor.fit(input_fn=_train_input_fn, steps=15)
+    regressor.evaluate(input_fn=_eval_input_fn, steps=1)
+    regressor.export(self._export_dir_base)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a7f85eada8c72de83b814af2f00e97a62a073e
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_utils.py
@@ -0,0 +1,74 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for converting between core and contrib feature columns."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.estimators import constants
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.contrib.learn.python.learn.estimators import model_fn as contrib_model_fn_lib
+from tensorflow.contrib.learn.python.learn.estimators import prediction_key
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output
+
+_CORE_MODE_TO_CONTRIB_MODE_ = {
+    model_fn_lib.ModeKeys.TRAIN: contrib_model_fn_lib.ModeKeys.TRAIN,
+    model_fn_lib.ModeKeys.EVAL: contrib_model_fn_lib.ModeKeys.EVAL,
+    model_fn_lib.ModeKeys.PREDICT: contrib_model_fn_lib.ModeKeys.INFER
+}
+
+
+def _core_mode_to_contrib_mode(mode):
+  return _CORE_MODE_TO_CONTRIB_MODE_[mode]
+
+
+def _export_outputs_to_output_alternatives(export_outputs):
+  """Converts EstimatorSpec.export_outputs to output_alternatives.
+
+  Args:
+    export_outputs: export_outputs created by create_estimator_spec.
+  Returns:
+    converted output_alternatives.
+  """
+  output = dict()
+  if export_outputs is not None:
+    for key, value in export_outputs.items():
+      if isinstance(value, export_output.ClassificationOutput):
+        exported_predictions = {
+            prediction_key.PredictionKey.SCORES: value.scores,
+            prediction_key.PredictionKey.CLASSES: value.classes
+        }
+        output[key] = (constants.ProblemType.CLASSIFICATION,
+                       exported_predictions)
+    return output
+  return None
+
+
+def estimator_spec_to_model_fn_ops(estimator_spec, export_alternatives=False):
+  if export_alternatives:
+    alternatives = _export_outputs_to_output_alternatives(
+        estimator_spec.export_outputs)
+  else:
+    alternatives = []
+
+  return model_fn.ModelFnOps(
+      mode=_core_mode_to_contrib_mode(estimator_spec.mode),
+      predictions=estimator_spec.predictions,
+      loss=estimator_spec.loss,
+      train_op=estimator_spec.train_op,
+      eval_metric_ops=estimator_spec.eval_metric_ops,
+      output_alternatives=alternatives)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index c6455a7ea3d18eb358edee034cee58b2bed21024..15ab6d814522ab1dee58dcd71246354fc4d8a483 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import copy
 
+from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
@@ -60,6 +61,7 @@ def model_builder(features, labels, mode, params, config):
   feature_columns = params["feature_columns"]
   weight_column_name = params["weight_column_name"]
   num_trees = params["num_trees"]
+  use_core_libs = params["use_core_libs"]
   logits_modifier_function = params["logits_modifier_function"]
   if features is None:
     raise ValueError("At least one feature must be specified.")
@@ -93,7 +95,8 @@ def model_builder(features, labels, mode, params, config):
       learner_config=learner_config,
       feature_columns=feature_columns,
       logits_dimension=head.logits_dimension,
-      features=training_features)
+      features=training_features,
+      use_core_columns=use_core_libs)
   with ops.name_scope("gbdt", "gbdt_optimizer"):
     predictions_dict = gbdt_model.predict(mode)
     logits = predictions_dict["predictions"]
@@ -108,12 +111,22 @@ def model_builder(features, labels, mode, params, config):
         update_op = state_ops.assign_add(global_step, 1).op
         return update_op
 
-  model_fn_ops = head.create_model_fn_ops(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_train_op_fn,
-      logits=logits)
+  create_estimator_spec_op = getattr(head, "create_estimator_spec", None)
+  if use_core_libs and callable(create_estimator_spec_op):
+    model_fn_ops = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
+  else:
+    model_fn_ops = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
   if num_trees:
     if center_bias:
       num_trees += 1
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 85b909e4f2556c520a5bffe46d5954683d9dda5a..4bde7f3e33d6f8b295cd35cb32bbbccecf8a2b87 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -23,7 +23,6 @@ import copy
 
 from tensorflow.contrib import learn
 from tensorflow.contrib import stateless
-
 from tensorflow.contrib.boosted_trees.lib.learner.batch import categorical_split_handler
 from tensorflow.contrib.boosted_trees.lib.learner.batch import ordinal_split_handler
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
@@ -141,7 +140,7 @@ class _OpRoundRobinStrategy(object):
     return task
 
 
-def extract_features(features, feature_columns):
+def extract_features(features, feature_columns, use_core_columns):
   """Extracts columns from a dictionary of features.
 
   Args:
@@ -174,7 +173,11 @@ def extract_features(features, feature_columns):
       transformed_features = collections.OrderedDict()
       for fc in feature_columns:
         # pylint: disable=protected-access
-        if isinstance(fc, feature_column_lib._EmbeddingColumn):
+        if use_core_columns:
+          # pylint: disable=protected-access
+          tensor = fc_core._transform_features(features, [fc])[fc]
+          transformed_features[fc.name] = tensor
+        elif isinstance(fc, feature_column_lib._EmbeddingColumn):
           # pylint: enable=protected-access
           transformed_features[fc.name] = fc_core.input_layer(
               features, [fc],
@@ -265,7 +268,8 @@ class GradientBoostedDecisionTreeModel(object):
                learner_config,
                features,
                logits_dimension,
-               feature_columns=None):
+               feature_columns=None,
+               use_core_columns=False):
     """Construct a new GradientBoostedDecisionTreeModel function.
 
     Args:
@@ -338,8 +342,9 @@ class GradientBoostedDecisionTreeModel(object):
     if not features:
       raise ValueError("Features dictionary must be specified.")
     (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
-     sparse_float_shapes, sparse_int_indices, sparse_int_values,
-     sparse_int_shapes) = extract_features(features, self._feature_columns)
+     sparse_float_shapes, sparse_int_indices,
+     sparse_int_values, sparse_int_shapes) = extract_features(
+         features, self._feature_columns, use_core_columns)
     logging.info("Active Feature Columns: " + str(fc_names))
     self._fc_names = fc_names
     self._dense_floats = dense_floats
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 6411f57a5419123e799af9231a04fce8ae7724d4..17dcb49f476bc5e14cfd27f8cec843f48982b782 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -27,9 +27,11 @@ from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
 from tensorflow.contrib.boosted_trees.python.utils import losses
 
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -99,7 +101,8 @@ class GbdtTest(test_util.TensorFlowTestCase):
           array_ops.zeros([2], dtypes.int64))
       (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
        sparse_float_shapes, sparse_int_indices, sparse_int_values,
-       sparse_int_shapes) = (gbdt_batch.extract_features(features, None))
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(features, None, use_core_columns=False))
       self.assertEqual(len(fc_names), 3)
       self.assertAllEqual(fc_names,
                           ["dense_float", "sparse_float", "sparse_int"])
@@ -148,8 +151,9 @@ class GbdtTest(test_util.TensorFlowTestCase):
               "sparse_categorical", hash_bucket_size=1000000))
       (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
        sparse_float_shapes, sparse_int_indices, sparse_int_values,
-       sparse_int_shapes) = (gbdt_batch.extract_features(
-           features, feature_columns))
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(
+               features, feature_columns, use_core_columns=False))
       self.assertEqual(len(fc_names), 3)
       self.assertAllEqual(fc_names,
                           ["dense_float", "sparse_float", "sparse_categorical"])
@@ -174,6 +178,41 @@ class GbdtTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sparse_int_shapes[0].eval(),
                           features["sparse_categorical"].dense_shape.eval())
 
+  def testExtractFeaturesFromCoreFeatureColumns(self):
+    """Tests feature extraction when using core columns."""
+    with self.test_session():
+      features = {}
+      # Sparse float column does not exist in core, so only dense numeric and
+      # categorical.
+      features["dense_float"] = array_ops.zeros([2, 1], dtypes.float32)
+      features["sparse_categorical"] = sparse_tensor.SparseTensor(
+          array_ops.zeros([2, 2], dtypes.int64),
+          array_ops.zeros([2], dtypes.string), array_ops.zeros([2],
+                                                               dtypes.int64))
+
+      feature_columns = set()
+      feature_columns.add(core_feature_column.numeric_column("dense_float"))
+      feature_columns.add(
+          core_feature_column.categorical_column_with_hash_bucket(
+              "sparse_categorical", hash_bucket_size=1000000))
+      (fc_names, dense_floats, _, _, _, sparse_int_indices, sparse_int_values,
+       sparse_int_shapes) = (
+           gbdt_batch.extract_features(
+               features, feature_columns, use_core_columns=True))
+      self.assertEqual(len(fc_names), 2)
+      self.assertAllEqual(fc_names, ["dense_float", "sparse_categorical"])
+      self.assertEqual(len(dense_floats), 1)
+      self.assertEqual(len(sparse_int_indices), 1)
+      self.assertEqual(len(sparse_int_values), 1)
+      self.assertEqual(len(sparse_int_shapes), 1)
+      self.assertAllEqual(dense_floats[0].eval(),
+                          features["dense_float"].eval())
+      self.assertAllEqual(sparse_int_indices[0].eval(),
+                          features["sparse_categorical"].indices.eval())
+      self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263])
+      self.assertAllEqual(sparse_int_shapes[0].eval(),
+                          features["sparse_categorical"].dense_shape.eval())
+
   def testTrainFnChiefNoBiasCentering(self):
     """Tests the train function running on chief without bias centering."""
     with self.test_session() as sess:
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 23b31ae1dcc83d8a7152354ac147de9ada320429..a7944ea74ae5eaf81d468b6efbdf4f7567cb9493 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -124,7 +124,7 @@ endif()
 
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
-  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
+  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11)
   add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
   add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
   add_definitions(-DTF_COMPILE_LIBRARY)
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 1fefb731a775d9cd2478cbb654662ec6ba673fed..35c2a294ecfa51516dcd3922b8a99b1b365de112 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2)
+set(GRPC_TAG 09386db3939cae1ac12e5f09b735adfa8958c68e)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 6cd66a65990e7a2b963b52b310061b551752cd4d..ad2af01bc002555ce48f8b9bfb7d8d724a1a7dc8 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -15,32 +15,33 @@
 include (ExternalProject)
 
 set(png_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/png_archive)
-set(png_URL https://storage.googleapis.com/libpng-public-archive/libpng-1.2.53.tar.gz)
-set(png_HASH SHA256=e05c9056d7f323088fd7824d8c6acc03a4a758c4b4916715924edc5dd3223a72)
+set(png_URL https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz)
+set(png_HASH SHA256=e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef)
 set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
     set(png_STATIC_LIBRARIES 
-      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
-      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib
+      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
   else()
     if(CMAKE_BUILD_TYPE EQUAL Debug)
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib)
     else()
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
     endif()
   endif()
 else()
-  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
+  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng16.a)
 endif()
 
 set(png_HEADERS
-    "${png_INSTALL}/include/libpng12/png.h"
-    "${png_INSTALL}/include/libpng12/pngconf.h"
+    "${png_INSTALL}/include/libpng16/png.h"
+    "${png_INSTALL}/include/libpng16/pngconf.h"
+    "${png_INSTALL}/include/libpng16/pnglibconf.h"
 )
 
 ExternalProject_Add(png
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 57c4ae76517e4d7247093edd5e5bd95a83258d87..7f835d2d519273a6d52d12f92ed585a4ddbeb973 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -15,8 +15,8 @@
 include (ExternalProject)
 
 set(sqlite_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/sqlite)
-set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip)
-set(sqlite_HASH SHA256=208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4)
+set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3230100.zip)
+set(sqlite_HASH SHA256=4239a1f69e5721d07d9a374eb84d594225229e54be4ee628da2995f4315d8dfc)
 set(sqlite_BUILD ${CMAKE_CURRENT_BINARY_DIR}/sqlite/src/sqlite)
 set(sqlite_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/sqlite/install)
 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index f273c7e5508e10407d013acd7adc08c732322841..91839194c7c214fe910ff78723ab418f86c7fac0 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -104,6 +104,8 @@ tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
 tensorflow/tools
+tensorflow/tools/api
+tensorflow/tools/api/generator
 tensorflow/tools/graph_transforms
 tensorflow/contrib
 tensorflow/contrib/all_reduce
@@ -168,7 +170,6 @@ tensorflow/contrib/distributions/python
 tensorflow/contrib/distributions/python/ops
 tensorflow/contrib/distributions/python/ops/bijectors
 tensorflow/contrib/eager
-tensorflow/contrib/eager/proto
 tensorflow/contrib/eager/python
 tensorflow/contrib/estimator
 tensorflow/contrib/estimator/python
@@ -355,6 +356,9 @@ tensorflow/contrib/periodic_resample
 tensorflow/contrib/periodic_resample/python
 tensorflow/contrib/periodic_resample/python/ops
 tensorflow/contrib/predictor
+tensorflow/contrib/proto
+tensorflow/contrib/proto/python
+tensorflow/contrib/proto/python/ops
 tensorflow/contrib/quantization
 tensorflow/contrib/quantization/python
 tensorflow/contrib/quantize
@@ -363,6 +367,10 @@ tensorflow/contrib/receptive_field
 tensorflow/contrib/receptive_field/python
 tensorflow/contrib/receptive_field/python/util
 tensorflow/contrib/receptive_field/python/util/examples
+tensorflow/contrib/recurrent
+tensorflow/contrib/recurrent/python
+tensorflow/contrib/recurrent/python/ops
+tensorflow/contrib/recurrent/python/kernel_tests
 tensorflow/contrib/reduce_slice_ops
 tensorflow/contrib/reduce_slice_ops/kernels
 tensorflow/contrib/reduce_slice_ops/ops
@@ -383,6 +391,9 @@ tensorflow/contrib/rnn/ops
 tensorflow/contrib/rnn/python
 tensorflow/contrib/rnn/python/kernel_tests
 tensorflow/contrib/rnn/python/ops
+tensorflow/contrib/rpc
+tensorflow/contrib/rpc/python
+tensorflow/contrib/rpc/python/ops
 tensorflow/contrib/saved_model
 tensorflow/contrib/saved_model/python
 tensorflow/contrib/saved_model/python/saved_model
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index 0c80d529af5230ed6d36b265e12ee4b749a14ec4..d63c41db844af243f0c6600b1565635ac9b91cac 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -5,7 +5,6 @@ tensorflow/python
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/cloud/kernels
 tensorflow/contrib/decision_trees/proto
-tensorflow/contrib/eager/proto
 tensorflow/contrib/gdr
 tensorflow/contrib/lite/toco
 tensorflow/contrib/mpi
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a1c320347fe60f87806736befc677541a93e7e93..f7cb186c7ca05fad1294439afaa6f8c43608600d 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    --raw_generate ${VERSION_INFO_CC}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
@@ -341,9 +341,3 @@ add_dependencies(tf_core_framework
     tf_core_lib
     proto_text
 )
-
-if(WIN32)
-  # Cmake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"" which nvcc fails on.
-  # Instead of defining this global, limit it to tf_core_framework where its used.
-  target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC")
-endif()
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 092a48bc6b63503be39343a1f936875082490b3e..e558691de4b74988031f7b2204aad92e8c7af68b 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -25,6 +25,8 @@ set(tf_op_lib_names
     "cudnn_rnn_ops"
     "data_flow_ops"
     "dataset_ops"
+    "decode_proto_ops"
+    "encode_proto_ops"
     "functional_ops"
     "image_ops"
     "io_ops"
@@ -40,6 +42,7 @@ set(tf_op_lib_names
     "random_ops"
     "remote_fused_graph_ops"
     "resource_variable_ops"
+    "rpc_ops"
     "script_ops"
     "sdca_ops"
     "set_ops"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index fae45ead5cafcb0f55834af223555f6e65f16015..1c3206f1a26d457a6bea4757df087da5af1bdc2b 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -330,6 +330,8 @@ GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
+GENERATE_PYTHON_OP_LIB("decode_proto_ops")
+GENERATE_PYTHON_OP_LIB("encode_proto_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
@@ -343,6 +345,7 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
+GENERATE_PYTHON_OP_LIB("rpc_ops")
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
@@ -583,6 +586,12 @@ add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_deffile}
 )
 
+# There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when
+# linking to the tensorflow library. Adding the following libraries fixes it.
+if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+    target_link_libraries(pywrap_tensorflow_internal PRIVATE gcc_s gcc)
+endif()
+
 if(WIN32)
     add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static)
 endif(WIN32)
@@ -686,6 +695,77 @@ AddUserOps(TARGET _beam_search_ops
     DEPENDS pywrap_tensorflow_internal tf_python_ops
     DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
 
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  else()
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  endif()
+else()
+  add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
+endif()
+
+
+########################################################
+# Generate API __init__.py files.
+########################################################
+
+# Parse tensorflow/tools/api/generator/BUILD to get list of generated files.
+FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/BUILD api_generator_BUILD_text)
+STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
+string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
+
+set(api_init_files "")
+foreach(api_init_file ${api_init_files_list})
+    string(STRIP "${api_init_file}" api_init_file)
+    if(api_init_file)
+        string(REPLACE "\"" "" api_init_file "${api_init_file}")  # Remove quotes
+        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/${api_init_file}")
+    endif()
+endforeach(api_init_file)
+set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
+file(WRITE "${api_init_list_file}" "${api_init_files}")
+
+# Run create_python_api.py to generate __init__.py files.
+add_custom_command(
+      OUTPUT ${api_init_files}
+      DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+      # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+      # this step is running since the files aren't there yet.
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      # Run create_python_api.py to generate API init files.
+      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
+              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" "${api_init_list_file}"
+
+      # Re-add tensorflow/__init__.py back.
+      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      COMMENT "Generating __init__.py files for Python API."
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+)
+
+add_custom_target(tf_python_api SOURCES ${api_init_files})
+add_dependencies(tf_python_api tf_python_ops)
+
+
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@@ -695,6 +775,7 @@ add_dependencies(tf_python_build_pip_package
     tf_python_copy_scripts_to_destination
     tf_python_touchup_modules
     tf_python_ops
+    tf_python_api
     tf_extension_ops)
 
 # Fix-up Python files that were not included by the add_python_module() macros.
@@ -707,25 +788,6 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
 
-if(WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  else()
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  endif()
-else()
-  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
-endif()
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 91ca33f4c4d5f6c822f45b0676e6e46d2e4c2860..af48ef1fd40456162fee8b1e2c3ca45ecdb58830 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -65,6 +65,12 @@ if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
     )
+    if (NOT tensorflow_BUILD_CC_TESTS)
+        file(GLOB tf_stream_executor_gpu_tests
+            "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc"
+        )
+        list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests})
+    endif()
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
 endif()
 
diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index ce12e38248785987e51befa47d04143e235554fe..9ca4ce8a9c765677865f77ea4982ad8613ce334c 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -92,6 +92,34 @@ tf_cc_test(
     ],
 )
 
+tf_kernel_library(
+    name = "pmf_to_cdf_op",
+    srcs = ["kernels/pmf_to_cdf_op.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":coder_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "pmf_to_cdf_op_test",
+    size = "small",
+    srcs = ["kernels/pmf_to_cdf_op_test.cc"],
+    deps = [
+        ":pmf_to_cdf_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
 cc_library(
     name = "all_ops",
     deps = [":coder_ops_op_lib"],
@@ -99,12 +127,16 @@ cc_library(
 
 cc_library(
     name = "all_kernels",
-    deps = [":range_coder_ops"],
+    deps = [
+        ":pmf_to_cdf_op",
+        ":range_coder_ops",
+    ],
 )
 
 tf_custom_op_library(
     name = "python/ops/_coder_ops.so",
     srcs = [
+        "kernels/pmf_to_cdf_op.cc",
         "kernels/range_coder.cc",
         "kernels/range_coder.h",
         "kernels/range_coder_ops.cc",
diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c787e8edede0942cd152eafa6333849d194e58b6
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc
@@ -0,0 +1,150 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+using errors::InvalidArgument;
+
+class PmfToCdfOp : public OpKernel {
+ public:
+  explicit PmfToCdfOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("precision", &precision_));
+    OP_REQUIRES(
+        context, 0 < precision_ && precision_ <= 16,
+        InvalidArgument("`precision` must be in [1, 16]: ", precision_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& pmf_tensor = context->input(0);
+
+    TensorShape shape = pmf_tensor.shape();
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(shape),
+                InvalidArgument("`pmf` should be at least 1-D."));
+    OP_REQUIRES(
+        context, shape.dim_size(shape.dims() - 1) > 1,
+        InvalidArgument("`pmf` size should be at least 2 in the last axis."));
+    shape.set_dim(shape.dims() - 1, shape.dim_size(shape.dims() - 1) + 1);
+
+    Tensor* cdf_tensor;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &cdf_tensor));
+
+    auto pmf = pmf_tensor.flat_inner_dims<float, 2>();
+    auto cdf = cdf_tensor->flat_inner_dims<int32, 2>();
+    CHECK_EQ(pmf.dimension(0), cdf.dimension(0));
+    CHECK_EQ(pmf.dimension(1) + 1, cdf.dimension(1));
+
+    const double n = pmf.dimension(1);
+    const int64 cost_per_unit = static_cast<int64>(50.0 * n * std::log2(n));
+    thread::ThreadPool* thread_pool =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    thread_pool->ParallelFor(
+        pmf.dimension(0), cost_per_unit,
+        [this, pmf, &cdf](int64 start, int64 limit) {
+          const gtl::ArraySlice<float>::size_type pmf_size = pmf.dimension(1);
+          for (int64 i = start; i < limit; ++i) {
+            cdf(i, 0) = 0;
+            PerShard({&pmf(i, 0), pmf_size}, {&cdf(i, 1), pmf_size});
+          }
+        });
+  }
+
+ private:
+  struct Item {
+    Item(int32* p, double mass) : pointer(p), mass(mass) {
+      penalty = ComputeNextPenalty();
+    }
+
+    void Decrease() {
+      CHECK_GT(*pointer, 1);
+      --*pointer;
+      penalty = ComputeNextPenalty();
+    }
+
+    friend bool operator<(const Item& lhs, const Item& rhs) {
+      return lhs.penalty < rhs.penalty;
+    }
+
+    double ComputeNextPenalty() {
+      if (*pointer <= 1) {
+        return std::numeric_limits<double>::infinity();
+      }
+      return mass * (std::log2(*pointer) - std::log2(*pointer - 1));
+    }
+
+    int32* pointer;
+    double mass;
+    double penalty;
+  };
+
+  void PerShard(gtl::ArraySlice<float> pmf,
+                gtl::MutableArraySlice<int32> cdf) const {
+    CHECK_EQ(pmf.size(), cdf.size());
+
+    const int32 normalizer = 1 << precision_;
+    std::transform(pmf.begin(), pmf.end(), cdf.begin(),
+                   [normalizer](float mass) {
+                     int32 value = std::rint(mass * normalizer);
+                     // NOTE: Consider checking if mass > 0.
+                     value = std::max(value, 1);
+                     return value;
+                   });
+
+    int32 sum = std::accumulate(cdf.begin(), cdf.end(), 0);
+    if (sum > normalizer) {
+      std::vector<Item> queue;
+      queue.reserve(cdf.size());
+      for (int i = 0; i < cdf.size(); ++i) {
+        queue.emplace_back(&cdf[i], pmf[i]);
+      }
+
+      std::sort(queue.begin(), queue.end());
+      while (sum-- > normalizer) {
+        queue[0].Decrease();
+        // Performs a linear search because this find_if is likely to return
+        // iterator very close to the begin.
+        auto iter =
+            std::find_if(std::next(queue.begin()), queue.end(),
+                         [&queue](const Item& rhs) { return queue[0] < rhs; });
+        std::rotate(queue.begin(), std::next(queue.begin()), iter);
+      }
+    }
+    std::partial_sum(cdf.begin(), cdf.end(), cdf.begin());
+  }
+
+  int precision_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("PmfToQuantizedCdf").Device(DEVICE_CPU),
+                        PmfToCdfOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c70e38faab713e23b5defa890d35bfadeac5940a
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <limits>
+
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+class PmfToQuantizedCdfOpTest : public OpsTestBase {
+ protected:
+  void SetupOp(int precision, Tensor* input) {
+    TF_ASSERT_OK(NodeDefBuilder("pmf_to_cdf", "PmfToQuantizedCdf")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("precision", precision)
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+
+    inputs_.clear();
+    inputs_.emplace_back(input);
+  }
+
+  void GenerateData(random::SimplePhilox* rand,
+                    gtl::MutableArraySlice<float> slice) {
+    constexpr float minimum = std::numeric_limits<float>::epsilon();
+    float sum = 0;
+    for (float& value : slice) {
+      value = std::max(rand->RandFloat(), minimum);
+      sum += value;
+    }
+    for (float& value : slice) {
+      value /= sum;
+    }
+  }
+
+  void Verify(int precision, const Tensor& pmf_tensor,
+              const Tensor& cdf_tensor) {
+    ASSERT_EQ(pmf_tensor.dims(), cdf_tensor.dims());
+    const int n = pmf_tensor.dims();
+
+    for (int i = 0; i < n - 1; ++i) {
+      EXPECT_EQ(pmf_tensor.dim_size(i), cdf_tensor.dim_size(i));
+    }
+
+    auto pmf = pmf_tensor.flat_inner_dims<float, 2>();
+    auto cdf = cdf_tensor.flat_inner_dims<int32, 2>();
+    EXPECT_EQ(pmf.dimension(1) + 1, cdf.dimension(1));
+
+    const int normalizer = 1 << precision;
+    for (int i = 0; i < pmf.dimension(0); ++i) {
+      EXPECT_EQ(0, cdf(i, 0));
+
+      TTypes<int32>::UnalignedConstVec cdf_slice(&cdf(i, 0), cdf.dimension(1));
+
+      for (int j = 1; j < cdf_slice.size(); ++j) {
+        const int32 diff = cdf_slice(j) - cdf_slice(j - 1);
+        EXPECT_GT(diff, 0);
+      }
+
+      EXPECT_LE(cdf_slice(cdf_slice.size() - 1), normalizer);
+    }
+  }
+};
+
+TEST_F(PmfToQuantizedCdfOpTest, UnderSum) {
+  Tensor pmf(DT_FLOAT, {1, 10, 1, 32});
+  auto matrix = pmf.flat_inner_dims<float, 2>();
+  const std::size_t n = matrix.dimension(1);
+
+  random::PhiloxRandom gen(random::New64(), random::New64());
+  random::SimplePhilox rand(&gen);
+  for (int64 i = 0; i < matrix.dimension(0); ++i) {
+    GenerateData(&rand, {&matrix(i, 0), n});
+  }
+
+  constexpr int kPrecision = 10;
+  SetupOp(kPrecision, &pmf);
+  TF_ASSERT_OK(RunOpKernel());
+
+  Verify(kPrecision, pmf, *GetOutput(0));
+}
+
+TEST_F(PmfToQuantizedCdfOpTest, OverSum) {
+  Tensor pmf(DT_FLOAT, {10, 1, 1, 100});
+  auto matrix = pmf.flat_inner_dims<float, 2>();
+
+  // Half of each PMF is filled with zeros. The op will round up zeros to ones,
+  // post quantization. These round ups are likely to make the sum over
+  // normalizer value.
+  matrix.setZero();
+  const std::size_t n = matrix.dimension(1) / 2;
+
+  random::PhiloxRandom gen;
+  random::SimplePhilox rand(&gen);
+  for (int64 i = 0; i < matrix.dimension(0); ++i) {
+    GenerateData(&rand, {&matrix(i, 0), n});
+  }
+
+  constexpr int kPrecision = 7;
+  SetupOp(kPrecision, &pmf);
+  TF_ASSERT_OK(RunOpKernel());
+
+  Verify(kPrecision, pmf, *GetOutput(0));
+}
+
+TEST_F(PmfToQuantizedCdfOpTest, ShapeFn) {
+  ShapeInferenceTestOp op("PmfToQuantizedCdf");
+
+  INFER_OK(op, "?", "?");
+  INFER_OK(op, "[3]", "[4]");
+  INFER_OK(op, "[3,4]", "[d0_0,5]");
+  INFER_OK(op, "[3,4,5]", "[d0_0,d0_1,6]");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/ops/coder_ops.cc b/tensorflow/contrib/coder/ops/coder_ops.cc
index 9056d1a6963d7be92f499db31385fb6afe2dc515..9bb171298f85088fdb776302776f2ba379b4f52e 100644
--- a/tensorflow/contrib/coder/ops/coder_ops.cc
+++ b/tensorflow/contrib/coder/ops/coder_ops.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
@@ -115,5 +116,36 @@ decoded: An int32 tensor with shape equal to `shape`.
 precision: The number of bits for probability quantization. Must be <= 16, and
   must match the precision used by RangeEncode that produced `encoded`.
 )doc");
+
+REGISTER_OP("PmfToQuantizedCdf")
+    .Input("pmf: float")
+    .Output("cdf: int32")
+    .Attr("precision: int >= 1")
+    .SetShapeFn([] (InferenceContext* c) {
+      ShapeHandle in;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &in));
+      DimensionHandle last;
+      TF_RETURN_IF_ERROR(c->Add(c->Dim(in, -1), 1, &last));
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->ReplaceDim(in, -1, last, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Converts PMF to quantized CDF. This op uses floating-point operations
+internally. Therefore the quantized output may not be consistent across multiple
+platforms. For entropy encoders and decoders to have the same quantized CDF on
+different platforms, the quantized CDF should be produced once and saved, then
+the saved quantized CDF should be used everywhere.
+
+After quantization, if PMF sums to less than or equal to 2^precision, then this
+is equivalent to cumsum over the last dimension. This op makes no effort to make
+the sum close to 2^precision when the sum is already <= 2^precision.
+
+After quantization, if PMF sums to greater than 2^precision, then some values of
+PMF is decreased to keep the sum no more than 2^precision.
+
+Note that the input PMF is pre-quantization.
+)doc");
 // clang-format on
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 721dc4d0801d1f0e116921888e3851a95e0b72b0..a5e065b93a23c3dd2838d81e7cf537dec226f4f9 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -281,6 +281,21 @@ class CrfTest(test.TestCase):
         self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
                          expected_max_sequence[:sequence_lengths])
 
+  def testCrfDecodeZeroSeqLength(self):
+    """
+    Test that crf_decode works when sequence_length contains one or more zeros.
+    """
+    with self.test_session() as sess:
+      inputs = constant_op.constant(np.ones([2, 10, 5],
+                                            dtype=np.float32))
+      transition_params = constant_op.constant(np.ones([5, 5],
+                                                       dtype=np.float32))
+      sequence_lengths = constant_op.constant(np.zeros([2],
+                                                       dtype=np.int32))
+      values = crf.crf_decode(inputs, transition_params, sequence_lengths)
+      tags, scores = sess.run(values)
+      self.assertEqual(len(tags.shape), 2)
+      self.assertEqual(len(scores.shape), 1)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 1233c8f251c404c57d9e2b38993e7a386b1e6ceb..e37c029cebf30eba59c560bc00ed73d2eea86213 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -479,15 +479,17 @@ def crf_decode(potentials, transition_params, sequence_length):
     initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
     initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
     inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+    # sequence length is not allowed to be less than zero
+    sequence_length_less_one = math_ops.maximum(0, sequence_length - 1)
     backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
         crf_fwd_cell,
         inputs=inputs,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
     backpointers = gen_array_ops.reverse_sequence(  # [B, T - 1, O]
-        backpointers, sequence_length - 1, seq_dim=1)
+        backpointers, sequence_length_less_one, seq_dim=1)
 
     # Computes backward decoding. Extract tag indices from backpointers.
     crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
@@ -497,7 +499,7 @@ def crf_decode(potentials, transition_params, sequence_length):
     decode_tags, _ = rnn.dynamic_rnn(  # [B, T - 1, 1]
         crf_bwd_cell,
         inputs=backpointers,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 9cc6ca09ad6c58a6cdc5909ec755ccdd49424872..6fb56b0858786662546ecab425b1a2564fbd9a64 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -29,7 +29,6 @@ import numpy as np
 
 from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
-from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.rnn.python.ops import rnn as contrib_rnn_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
@@ -55,6 +54,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import momentum
 from tensorflow.python.training import rmsprop
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 9796aae4b01e9efc6c9a08ab803bda4ee614abf2..c28c3a18e40d85c054e3dd8603fc997c775ccc5a 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -23,7 +23,7 @@ from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import init_ops
@@ -524,10 +524,7 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
   _rnn_mode = CUDNN_LSTM
   _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(CudnnCompatibleLSTMCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
 
   def _cudnn_to_tf_gate_params(self, *cu_gate_order):
     i_g, f_g, c_g, o_g = cu_gate_order
@@ -648,10 +645,7 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
   _rnn_mode = CUDNN_GRU
   _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(CudnnCompatibleGRUCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
@@ -730,11 +724,7 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
 class CudnnRNNSimpleSaveable(CudnnLSTMSaveable):
   """SaveableObject implementation handling Cudnn RNN Tanh opaque params."""
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(
-      rnn_cell_impl.BasicRNNCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 7bb0dc1c0f695f4d1c7739fa11764ded4ff9410a..8bdbba83ef6a8541158d956e36caf6a9be435c5b 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -22,13 +22,7 @@ py_library(
     deps = [
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:iterator_ops",
-        "//tensorflow/contrib/data/python/ops:prefetching_ops",
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/contrib/data/python/ops:shuffle_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
-        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 125260b4c1f6b63c8f83f28d1829afe2d9d3ea97..637b1dc46cbbfeaed02ab4273d0365bb8ee3fcba 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -80,8 +80,6 @@ from tensorflow.contrib.data.python.ops.resampling import rejection_resample
 from tensorflow.contrib.data.python.ops.scan_ops import scan
 from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
 from tensorflow.contrib.data.python.ops.sliding import sliding_window_batch
-from tensorflow.python.data.ops.iterator_ops import Iterator
-from tensorflow.python.ops.parsing_ops import parse_single_example_v2 as parse_single_example
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 7270d533c69002ad6b318645f1ef07ebb45a85c3..b475c9fa6b1c1163df3b567d54889e4144abf719 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -8,14 +8,13 @@ load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["batch_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -38,8 +37,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:grouping",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -60,10 +58,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
@@ -80,8 +78,7 @@ py_test(
     ],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -128,13 +125,13 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -146,7 +143,7 @@ tf_py_test(
     additional_deps = [
         ":dataset_serialization_test",
         "//third_party/py/numpy",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -176,8 +173,7 @@ py_test(
     ],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:interleave_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -188,6 +184,7 @@ py_test(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -198,7 +195,8 @@ tf_py_test(
     srcs = ["get_single_element_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:get_single_element",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -216,8 +214,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:error_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -262,8 +259,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:counter",
+        "//tensorflow/contrib/data/python/ops:enumerate_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -275,6 +272,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -310,12 +308,12 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["noasan"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:resampling",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -328,7 +326,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:scan_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -347,11 +345,11 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -379,7 +377,6 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -416,10 +413,10 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:stats_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -430,10 +427,11 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:threadpool",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -445,13 +443,13 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/contrib/stateless",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -464,11 +462,11 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -498,8 +496,8 @@ tf_py_test(
     size = "small",
     srcs = ["slide_dataset_op_test.py"],
     additional_deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:sliding",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index f3e93024092da7c99d4e529f0bd6a6eec06068bf..1075302bae96ca2e0111efbacdf5e919ea76897d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -295,18 +295,20 @@ class ReadBatchFeaturesTest(test.TestCase):
         ).get_next()
 
   def _record(self, f, r):
-    example = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            "file":
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[f])),
-            "record":
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[r])),
-            "keywords":
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=self._get_keywords(f, r)))
-        }))
+    example = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                "file":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[f])),
+                "record":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[r])),
+                "keywords":
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=self._get_keywords(f, r)))
+            }))
     return example.SerializeToString()
 
   def _get_keywords(self, f, r):
@@ -374,8 +376,8 @@ class ReadBatchFeaturesTest(test.TestCase):
         record_batch.append(r)
         keywords = self._get_keywords(f, r)
         keywords_batch_values.extend(keywords)
-        keywords_batch_indices.extend([[batch_index, i]
-                                       for i in range(len(keywords))])
+        keywords_batch_indices.extend(
+            [[batch_index, i] for i in range(len(keywords))])
         batch_index += 1
         keywords_batch_max_len = max(keywords_batch_max_len, len(keywords))
         if len(file_batch) == batch_size:
@@ -475,9 +477,10 @@ class ReadBatchFeaturesTest(test.TestCase):
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
         "record": parsing_ops.FixedLenFeature([], dtypes.int64),
     }
-    dataset = (core_readers.TFRecordDataset(self.test_filenames)
-               .map(lambda x: parsing_ops.parse_single_example(x, features))
-               .repeat(10).batch(2))
+    dataset = (
+        core_readers.TFRecordDataset(self.test_filenames)
+        .map(lambda x: parsing_ops.parse_single_example(x, features))
+        .repeat(10).batch(2))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     next_element = iterator.get_next()
@@ -607,20 +610,25 @@ class MakeCsvDatasetTest(test.TestCase):
         "record %d" % recordno if recordno % 2 == 1 else "",
     ]
 
-  def _csv_record(self, fileno, recordno):
-    return ",".join(str(v) for v in self._csv_values(fileno, recordno))
+  def _write_file(self, filename, rows):
+    for i in range(len(rows)):
+      if isinstance(rows[i], list):
+        rows[i] = ",".join(str(v) if v is not None else "" for v in rows[i])
+    fn = os.path.join(self.get_temp_dir(), filename)
+    f = open(fn, "w")
+    f.write("\n".join(rows))
+    f.close()
+    return fn
 
   def _create_file(self, fileno, header=True, comment=True):
-    fn = os.path.join(self.get_temp_dir(), "csv_file%d.csv" % fileno)
-    f = open(fn, "w")
+    rows = []
     if header:
-      f.write(",".join(self.COLUMNS) + "\n")
+      rows.append(self.COLUMNS)
     for recno in range(self._num_records):
-      f.write(self._csv_record(fileno, recno) + "\n")
+      rows.append(self._csv_values(fileno, recno))
       if comment:
-        f.write("# Some comment goes here. Should be ignored!\n")
-    f.close()
-    return fn
+        rows.append("# Some comment goes here. Ignore me.")
+    return self._write_file("csv_file%d.csv" % fileno, rows)
 
   def _create_files(self):
     filenames = []
@@ -634,6 +642,7 @@ class MakeCsvDatasetTest(test.TestCase):
       defaults,
       column_names=COLUMNS,
       label_name=LABEL,
+      select_cols=None,
       batch_size=1,
       num_epochs=1,
       shuffle=False,
@@ -656,6 +665,7 @@ class MakeCsvDatasetTest(test.TestCase):
         comment=comment,
         na_value=na_value,
         default_float_type=default_float_type,
+        select_columns=select_cols,
     )
 
   def _next_actual_batch(self, file_indices, batch_size, num_epochs, defaults):
@@ -712,7 +722,7 @@ class MakeCsvDatasetTest(test.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
-  def test_make_csv_dataset(self):
+  def testMakeCSVDataset(self):
     defaults = self.DEFAULTS
 
     with ops.Graph().as_default() as g:
@@ -739,7 +749,7 @@ class MakeCsvDatasetTest(test.TestCase):
         self._verify_records(
             sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
 
-  def test_make_csv_dataset_with_bad_columns(self):
+  def testMakeCSVDataset_withBadColumns(self):
     """Tests that exception is raised when input is malformed.
     """
     dupe_columns = self.COLUMNS[:-1] + self.COLUMNS[:1]
@@ -755,7 +765,7 @@ class MakeCsvDatasetTest(test.TestCase):
       self._make_csv_dataset(
           self._test_filenames, defaults, label_name="not_a_real_label")
 
-  def test_make_csv_dataset_with_no_label(self):
+  def testMakeCSVDataset_withNoLabel(self):
     """Tests that CSV datasets can be created when no label is specified.
     """
     defaults = self.DEFAULTS
@@ -776,7 +786,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
             label_name=None)
 
-  def test_make_csv_dataset_with_no_comments(self):
+  def testMakeCSVDataset_withNoComments(self):
     """Tests that datasets can be created from CSV files with no header line.
     """
     defaults = self.DEFAULTS
@@ -799,7 +809,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
         )
 
-  def test_make_csv_dataset_with_no_header(self):
+  def testMakeCSVDataset_withNoHeader(self):
     """Tests that datasets can be created from CSV files with no header line.
     """
     defaults = self.DEFAULTS
@@ -822,7 +832,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
         )
 
-  def test_make_csv_dataset_with_types(self):
+  def testMakeCSVDataset_withTypes(self):
     """Tests that defaults can be a dtype instead of a Tensor for required vals.
     """
     defaults = [d for d in self.COLUMN_TYPES[:-1]]
@@ -832,7 +842,7 @@ class MakeCsvDatasetTest(test.TestCase):
         dataset = self._make_csv_dataset(self._test_filenames, defaults)
         self._verify_records(sess, dataset, range(self._num_files))
 
-  def test_make_csv_dataset_with_no_col_names(self):
+  def testMakeCSVDataset_withNoColNames(self):
     """Tests that datasets can be created when column names are not specified.
 
     In that case, we should infer the column names from the header lines.
@@ -851,7 +861,17 @@ class MakeCsvDatasetTest(test.TestCase):
         self._verify_records(
             sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
 
-  def test_make_csv_dataset_type_inference(self):
+  def testMakeCSVDataset_withTypeInferenceMismatch(self):
+    # Test that error is thrown when num fields doesn't match columns
+    with self.assertRaises(ValueError):
+      self._make_csv_dataset(
+          self._test_filenames,
+          column_names=self.COLUMNS + ["extra_name"],
+          defaults=None,
+          batch_size=2,
+          num_epochs=10)
+
+  def testMakeCSVDataset_withTypeInference(self):
     """Tests that datasets can be created when no defaults are specified.
 
     In that case, we should infer the types from the first N records.
@@ -875,19 +895,16 @@ class MakeCsvDatasetTest(test.TestCase):
         dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float32,
         dtypes.string, dtypes.string
     ]
-    rows = [[0, 0, 0, "NAN", "", "a"], [1, 2**31 + 1, 2**64, 123, "NAN", ""],
+    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
+    rows = [[None, None, None, "NAN", "",
+             "a"], [1, 2**31 + 1, 2**64, 123, "NAN", ""],
             ['"123"', 2, 2**64, 123.4, "NAN", '"cd,efg"']]
     expected = [[0, 0, 0, 0, "", "a"], [1, 2**31 + 1, 2**64, 123, "", ""],
                 [123, 2, 2**64, 123.4, "", "cd,efg"]]
     for row in expected:
       row[-1] = row[-1].encode("utf-8")  # py3 expects byte strings
       row[-2] = row[-2].encode("utf-8")  # py3 expects byte strings
-    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
-    with open(fn, "w") as f:
-      f.write(",".join(col_names))
-      f.write("\n")
-      for row in rows:
-        f.write(",".join([str(v) if v else "" for v in row]) + "\n")
+    self._write_file("file.csv", [col_names] + rows)
 
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
@@ -895,8 +912,6 @@ class MakeCsvDatasetTest(test.TestCase):
             fn,
             defaults=None,
             column_names=None,
-            batch_size=1,
-            num_epochs=1,
             label_name=None,
             na_value="NAN",
             default_float_type=dtypes.float32,
@@ -919,8 +934,6 @@ class MakeCsvDatasetTest(test.TestCase):
             fn,
             defaults=None,
             column_names=None,
-            batch_size=1,
-            num_epochs=1,
             label_name=None,
             na_value="NAN",
             default_float_type=dtypes.float64,
@@ -928,11 +941,99 @@ class MakeCsvDatasetTest(test.TestCase):
         features = dataset.make_one_shot_iterator().get_next()
         # Check that types match
         for i in range(len(expected_dtypes)):
-          assert features["col%d" % i].dtype == expected_dtypes[i]
+          self.assertAllEqual(features["col%d" % i].dtype, expected_dtypes[i])
         for i in range(len(rows)):
-          assert sess.run(features) == dict(zip(col_names, expected[i]))
+          self.assertAllEqual(
+              sess.run(features), dict(zip(col_names, expected[i])))
+
+  def testMakeCSVDataset_withSelectColsError(self):
+    data = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+    col_names = ["col%d" % i for i in range(5)]
+    fn = self._write_file("file.csv", [col_names] + data)
+    with self.assertRaises(ValueError):
+      # Mismatch in number of defaults and number of columns selected,
+      # should raise an error
+      self._make_csv_dataset(
+          fn,
+          defaults=[[0]] * 5,
+          column_names=col_names,
+          label_name=None,
+          select_cols=[1, 3])
+    with self.assertRaises(ValueError):
+      # Invalid column name should raise an error
+      self._make_csv_dataset(
+          fn,
+          defaults=[[0]],
+          column_names=col_names,
+          label_name=None,
+          select_cols=["invalid_col_name"])
+
+  def testMakeCSVDataset_withSelectCols(self):
+    data = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+    col_names = ["col%d" % i for i in range(5)]
+    fn = self._write_file("file.csv", [col_names] + data)
+    # If select_cols is specified, should only yield a subset of columns
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=[[0], [0]],
+            column_names=col_names,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can still do default inference with select_cols
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=col_names,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can still do column name inference
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can specify column names instead of indices
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            select_cols=[col_names[1], col_names[3]])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
 
-  def test_make_csv_dataset_with_shuffle(self):
+  def testMakeCSVDataset_withShuffle(self):
     total_records = self._num_files * self._num_records
     defaults = self.DEFAULTS
     for batch_size in [1, 2]:
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 07bdf920446e953c2a1abaf495d2e9e1256106fd..c3a7f291c59a72dc6057f7e1c51d5ac78334176b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -218,6 +218,14 @@ class StatsDatasetSerializationTest(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
 
+  def test_bytes_produced_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.bytes_produced_stats(["bytes_produced"])),
+          None, 100)
+
   def testBytesStatsDatasetSaveableCore(self):
     num_outputs = 100
     self.run_core_tests(
@@ -235,6 +243,14 @@ class StatsDatasetSerializationTest(
     return dataset_ops.Dataset.range(num_elements).apply(
         stats_ops.latency_stats(tag1)).apply(stats_ops.latency_stats(tag2))
 
+  def test_latency_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.latency_stats(["record_latency", "record_latency_2"])),
+          None, 100)
+
   def testLatencyStatsDatasetSaveableCore(self):
     num_outputs = 100
 
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index a1a5c9ed05ff226086885e4e204875d3ca933590..0e4590829b19ce00d141a16247a99b03d0120447 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -12,18 +12,26 @@ load(
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 py_library(
-    name = "dataset_ops",
-    srcs = [
-        "counter.py",
-        "get_single_element.py",
+    name = "counter",
+    srcs = ["counter.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":scan_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
+)
+
+py_library(
+    name = "get_single_element",
+    srcs = ["get_single_element.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":transformation_ops",
         "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
 
@@ -66,7 +74,8 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":dataset_ops",
+        ":batching",
+        ":interleave_ops",
         ":shuffle_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dataset_ops_gen",
@@ -94,51 +103,170 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":random_ops",
-        ":transformation_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 py_library(
-    name = "transformation_ops",
-    srcs = [
-        "batching.py",
-        "enumerate_ops.py",
-        "error_ops.py",
-        "grouping.py",
-        "interleave_ops.py",
-        "resampling.py",
-        "scan_ops.py",
-        "sliding.py",
-        "stats_ops.py",
-        "threadpool.py",
-        "unique.py",
+    name = "batching",
+    srcs = ["batching.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
+)
+
+py_library(
+    name = "enumerate_ops",
+    srcs = ["enumerate_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "error_ops",
+    srcs = ["error_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":contrib_op_loader",
         ":gen_dataset_ops",
-        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "grouping",
+    srcs = ["grouping.py"],
+    srcs_version = "PY2AND3",
+    deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "interleave_ops",
+    srcs = ["interleave_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+py_library(
+    name = "resampling",
+    srcs = ["resampling.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batching",
+        ":scan_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "scan_ops",
+    srcs = ["scan_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "sliding",
+    srcs = ["sliding.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "stats_ops",
+    srcs = ["stats_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
+    name = "threadpool",
+    srcs = ["threadpool.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
         "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:tensor_util",
-        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
-        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "unique",
+    srcs = [
+        "unique.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
 
@@ -184,3 +312,29 @@ py_library(
         "//tensorflow/python/data/util:sparse",
     ],
 )
+
+py_library(
+    name = "dataset_ops",
+    deps = [
+        ":batching",
+        ":counter",
+        ":enumerate_ops",
+        ":error_ops",
+        ":get_single_element",
+        ":grouping",
+        ":interleave_ops",
+        ":prefetching_ops",
+        ":readers",
+        ":resampling",
+        ":scan_ops",
+        ":shuffle_ops",
+        ":sliding",
+        ":stats_ops",
+        ":threadpool",
+        ":unique",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index 36591c055ae8f2c54981525ffcc3df128a990a61..0531f9cbb9da6e6df85fa46940ab1661ad742eb4 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -108,7 +108,7 @@ def bucket_by_sequence_length(element_length_func,
   fraction of padding in a batch which increases training step efficiency.
 
   Args:
-    element_length_func: function from element in `Dataset` to `tf.int64`,
+    element_length_func: function from element in `Dataset` to `tf.int32`,
       determines the length of the element, which will determine the bucket it
       goes into.
     bucket_boundaries: `list<int>`, upper length boundaries of the buckets.
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index b8eb09978e490d706595f18525f424548679ae64..4ec8ae1c79d1eb99c56b31c6a0709a84c38f5f90 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -124,18 +124,21 @@ def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
 
 def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim,
                            na_value, header, comment, float_dtype,
-                           rows_for_inference):
+                           num_rows_for_inference, select_columns):
   """Infers column types from the first N valid CSV records of files."""
-  inferred_types = [None] * num_cols
+  if select_columns is None:
+    select_columns = range(num_cols)
+  inferred_types = [None] * len(select_columns)
 
-  for rows_read, csv_row in enumerate(
+  for i, csv_row in enumerate(
       _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
                     comment)):
-    if rows_for_inference is not None and rows_read >= rows_for_inference:
+    if num_rows_for_inference is not None and i >= num_rows_for_inference:
       break
-    for i, str_val in enumerate(csv_row):
-      inferred_types[i] = _infer_type(str_val, na_value, inferred_types[i],
-                                      float_dtype)
+
+    for j, col_index in enumerate(select_columns):
+      inferred_types[j] = _infer_type(csv_row[col_index], na_value,
+                                      inferred_types[j], float_dtype)
 
   # Replace None's with a default type
   inferred_types = [t or dtypes.string for t in inferred_types]
@@ -162,12 +165,37 @@ def _infer_column_names(filenames, field_delim, use_quote_delim):
   return column_names
 
 
+def _get_sorted_col_indices(select_columns, column_names):
+  """Transforms select_columns argument into sorted column indices."""
+  names_to_indices = {n: i for i, n in enumerate(column_names)}
+  num_cols = len(column_names)
+  for i, v in enumerate(select_columns):
+    if isinstance(v, int):
+      if v < 0 or v >= num_cols:
+        raise ValueError(
+            "Column index %d specified in select_columns out of valid range." %
+            v)
+      continue
+    if v not in names_to_indices:
+      raise ValueError(
+          "Value '%s' specified in select_columns not a valid column index or "
+          "name." % v)
+    select_columns[i] = names_to_indices[v]
+
+  # Sort and ensure there are no duplicates
+  result = sorted(set(select_columns))
+  if len(result) != len(select_columns):
+    raise ValueError("select_columns contains duplicate columns")
+  return result
+
+
 def make_csv_dataset(
     file_pattern,
     batch_size,
     column_names=None,
     column_defaults=None,
     label_name=None,
+    select_columns=None,
     field_delim=",",
     use_quote_delim=True,
     na_value="",
@@ -201,20 +229,32 @@ def make_csv_dataset(
       provided, infers the column names from the first row of the records.
       These names will be the keys of the features dict of each dataset element.
     column_defaults: A optional list of default values for the CSV fields. One
-      item per column of the input record. Each item in the list is either a
-      valid CSV dtype (float32, float64, int32, int64, or string), or a
+      item per selected column of the input record. Each item in the list is
+      either a valid CSV dtype (float32, float64, int32, int64, or string), or a
       `Tensor` with one of the aforementioned types. The tensor can either be
       a scalar default value (if the column is optional), or an empty tensor (if
       the column is required). If a dtype is provided instead of a tensor, the
       column is also treated as required. If this list is not provided, tries
       to infer types based on reading the first num_rows_for_inference rows of
       files specified, and assumes all columns are optional, defaulting to `0`
-      for numeric values and `""` for string values.
+      for numeric values and `""` for string values. If both this and
+      `select_columns` are specified, these must have the same lengths, and
+      `column_defaults` is assumed to be sorted in order of increasing column
+      index.
     label_name: A optional string corresponding to the label column. If
       provided, the data for this column is returned as a separate `Tensor` from
       the features dictionary, so that the dataset complies with the format
       expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
       function.
+    select_columns: An optional list of integer indices or string column
+      names, that specifies a subset of columns of CSV data to select. If
+      column names are provided, these must correspond to names provided in
+      `column_names` or inferred from the file header lines. When this argument
+      is specified, only a subset of CSV columns will be parsed and returned,
+      corresponding to the columns specified. Using this results in faster
+      parsing and lower memory usage. If both this and `column_defaults` are
+      specified, these must have the same lengths, and `column_defaults` is
+      assumed to be sorted in order of increasing column index.
     field_delim: An optional `string`. Defaults to `","`. Char delimiter to
       separate fields in a record.
     use_quote_delim: An optional bool. Defaults to `True`. If false, treats
@@ -279,6 +319,9 @@ def make_csv_dataset(
   if len(column_names) != len(set(column_names)):
     raise ValueError("Cannot have duplicate column names.")
 
+  if select_columns is not None:
+    select_columns = _get_sorted_col_indices(select_columns, column_names)
+
   if column_defaults is not None:
     column_defaults = [
         constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
@@ -289,7 +332,17 @@ def make_csv_dataset(
     # construction time
     column_defaults = _infer_column_defaults(
         filenames, len(column_names), field_delim, use_quote_delim, na_value,
-        header, comment, default_float_type, num_rows_for_inference)
+        header, comment, default_float_type, num_rows_for_inference,
+        select_columns)
+
+  if select_columns is not None and len(column_defaults) != len(select_columns):
+    raise ValueError(
+        "If specified, column_defaults and select_columns must have same "
+        "length."
+    )
+  if select_columns is not None and len(column_names) > len(select_columns):
+    # Pick the relevant subset of column names
+    column_names = [column_names[i] for i in select_columns]
 
   if label_name is not None and label_name not in column_names:
     raise ValueError("`label_name` provided must be one of the columns.")
@@ -322,6 +375,7 @@ def make_csv_dataset(
         field_delim=field_delim,
         use_quote_delim=use_quote_delim,
         na_value=na_value,
+        select_cols=select_columns,
     )
     features = dict(zip(column_names, columns))
     if label_name is not None:
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 78b2b0054aa95701ad192b4fb9a0727ce287de4b..5aad21cccd3ca3115c5156150b6ba46fe7b25dd2 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -22,11 +22,13 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":prefetching_ops_v2",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/contrib/eager/python:datasets",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:checkpointable",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
@@ -51,6 +53,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python:device_util",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:model_fn",
     ],
@@ -66,6 +69,8 @@ py_library(
         ":values",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:device",
+        "//tensorflow/python:device_util",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:training",
@@ -84,9 +89,9 @@ py_library(
         ":values",
         "//tensorflow/contrib/eager/python:datasets",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:training",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -104,6 +109,7 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:layers",
         "//tensorflow/python:training",
@@ -156,8 +162,8 @@ py_test(
     deps = [
         ":mirrored_strategy",
         ":strategy_test_lib",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -186,10 +192,10 @@ cuda_py_test(
         ":mirrored_strategy",
         ":values",
         ":strategy_test_lib",
+        "//tensorflow/python:distribute",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:layers",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_test_lib",
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
index bbe5e877d59518056db3fea251cdae0ed854d0e4..cff717db80f0bdd377b3c9c7e8ca3578ff273930 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -488,7 +488,8 @@ class AllReduceCrossTowerOps(CrossTowerOps):
           "agg_small_grads_max_group = %d", len(per_device_values),
           self.all_reduce_alg, self.agg_small_grads_max_bytes,
           self.agg_small_grads_max_group)
-      tensor_packer = AggregateSmallTensorPacker(100, 10)
+      tensor_packer = AggregateSmallTensorPacker(
+          self.agg_small_grads_max_bytes, self.agg_small_grads_max_group)
       device_grad_packs = tensor_packer.pack(grouped)
     else:
       logging.info(
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index fec6eafd4a66d38e9c99163b059bfeb81d8ad120..20e432b88dc60d45fd32710574ed6e57d0f8a792 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -1174,6 +1174,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "softsign_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/softsign_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "square_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
index e0d65c79b2654c2949de161d6317f218d11cab43..042c8ebd51c47facfc5c942cae56bd56be9df7c5 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
@@ -18,11 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 # pylint: disable=g-importing-member
 from tensorflow.contrib.distributions.python.ops.bijectors.absolute_value import AbsoluteValue
-from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -35,50 +32,38 @@ class AbsoluteValueTest(test.TestCase):
 
   def testBijectorVersusNumpyRewriteOfBasicFunctionsEventNdims0(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       self.assertEqual("absolute_value", bijector.name)
       x = array_ops.constant([[0., 1., -1], [0., -5., 3.]])  # Shape [2, 3]
       y = math_ops.abs(x)
 
       y_ = y.eval()
-      zeros = np.zeros((2, 3))
 
       self.assertAllClose(y_, bijector.forward(x).eval())
       self.assertAllClose((-y_, y_), sess.run(bijector.inverse(y)))
-      self.assertAllClose((zeros, zeros),
-                          sess.run(bijector.inverse_log_det_jacobian(y)))
+      self.assertAllClose((0., 0.),
+                          sess.run(bijector.inverse_log_det_jacobian(
+                              y, event_ndims=0)))
 
       # Run things twice to make sure there are no issues in caching the tuples
       # returned by .inverse*
       self.assertAllClose(y_, bijector.forward(x).eval())
       self.assertAllClose((-y_, y_), sess.run(bijector.inverse(y)))
-      self.assertAllClose((zeros, zeros),
-                          sess.run(bijector.inverse_log_det_jacobian(y)))
-
-  def testEventNdimsMustBeZeroOrRaiseStatic(self):
-    with self.test_session():
-      with self.assertRaisesRegexp(ValueError, "event_ndims.*was not 0"):
-        AbsoluteValue(event_ndims=1)
-
-  def testEventNdimsMustBeZeroOrRaiseDynamic(self):
-    with self.test_session() as sess:
-      event_ndims = array_ops.placeholder(dtypes.int32)
-      abs_bijector = AbsoluteValue(event_ndims=event_ndims, validate_args=True)
-      with self.assertRaisesOpError("event_ndims was not 0"):
-        sess.run(abs_bijector.inverse_log_det_jacobian([1.]),
-                 feed_dict={event_ndims: 1})
+      self.assertAllClose((0., 0.),
+                          sess.run(bijector.inverse_log_det_jacobian(
+                              y, event_ndims=0)))
 
   def testNegativeYRaisesForInverseIfValidateArgs(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       with self.assertRaisesOpError("y was negative"):
         sess.run(bijector.inverse(-1.))
 
   def testNegativeYRaisesForILDJIfValidateArgs(self):
     with self.test_session() as sess:
-      bijector = AbsoluteValue(event_ndims=0, validate_args=True)
+      bijector = AbsoluteValue(validate_args=True)
       with self.assertRaisesOpError("y was negative"):
-        sess.run(bijector.inverse_log_det_jacobian(-1.))
+        sess.run(bijector.inverse_log_det_jacobian(-1., event_ndims=0))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
index 405ddd292cacd8ace87d6caeebf3e8cfc347c22d..1e4ad724d00f751a55370ef9aa6dde0003a2098c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
@@ -38,9 +38,11 @@ class AffineLinearOperatorTest(test.TestCase):
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(
+          y, event_ndims=2).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=2).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=2).eval())
 
   def testDiag(self):
     with self.test_session():
@@ -58,14 +60,16 @@ class AffineLinearOperatorTest(test.TestCase):
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          ildj, affine.inverse_log_det_jacobian(y, event_ndims=1).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testTriL(self):
     with self.test_session():
       shift = np.array([-1, 0, 1], dtype=np.float32)
-      tril = np.array([[[1, 0, 0],
+      tril = np.array([[[3, 0, 0],
                         [2, -1, 0],
                         [3, 2, 1]],
                        [[2, 0, 0],
@@ -85,15 +89,17 @@ class AffineLinearOperatorTest(test.TestCase):
       # y = np.matmul(x, tril) + shift.
       y = np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
       ildj = -np.sum(np.log(np.abs(np.diagonal(
-          tril, axis1=-2, axis2=-1))),
-                     axis=-1)
+          tril, axis1=-2, axis2=-1))))
 
       self.assertEqual(affine.name, "affine_linear_operator")
       self.assertAllClose(y, affine.forward(x).eval())
       self.assertAllClose(x, affine.inverse(y).eval())
-      self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
-                          affine.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          ildj, affine.inverse_log_det_jacobian(
+              y, event_ndims=2).eval())
+      self.assertAllClose(
+          -affine.inverse_log_det_jacobian(y, event_ndims=2).eval(),
+          affine.forward_log_det_jacobian(x, event_ndims=2).eval())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
index 16173a166fd943413345036df12245c2a4ab8343..d2533620bebeb0400b6d4a6346e8315c7e37c5c6 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
@@ -40,13 +40,13 @@ class AffineScalarBijectorTest(test.TestCase):
   def testNoBatchScalar(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
         x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -55,19 +55,20 @@ class AffineScalarBijectorTest(test.TestCase):
         x = [1., 2, 3]  # Three scalar samples (no batches).
         self.assertAllClose([1., 3, 5], run(bijector.forward, x))
         self.assertAllClose([1., 1.5, 2.], run(bijector.inverse, x))
-        self.assertAllClose([-np.log(2.)] * 3,
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testOneBatchScalarViaIdentityIn64BitUserProvidesShiftOnly(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value).astype(np.float64)
         x = array_ops.placeholder(dtypes.float64, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         mu = np.float64([1.])
@@ -77,18 +78,20 @@ class AffineScalarBijectorTest(test.TestCase):
         x = np.float64([1.])  # One sample from one batches.
         self.assertAllClose([2.], run(bijector.forward, x))
         self.assertAllClose([0.], run(bijector.inverse, x))
-        self.assertAllClose([0.], run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            0.,
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testOneBatchScalarViaIdentityIn64BitUserProvidesScaleOnly(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value).astype(np.float64)
         x = array_ops.placeholder(dtypes.float64, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         multiplier = np.float64([2.])
@@ -98,19 +101,20 @@ class AffineScalarBijectorTest(test.TestCase):
         x = np.float64([1.])  # One sample from one batches.
         self.assertAllClose([2.], run(bijector.forward, x))
         self.assertAllClose([0.5], run(bijector.inverse, x))
-        self.assertAllClose([np.log(0.5)],
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            [np.log(0.5)],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testTwoBatchScalarIdentityViaIdentity(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value).astype(np.float32)
         x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
@@ -120,18 +124,20 @@ class AffineScalarBijectorTest(test.TestCase):
         x = [1., 1]  # One sample from each of two batches.
         self.assertAllClose([2., 0], run(bijector.forward, x))
         self.assertAllClose([0., 2], run(bijector.inverse, x))
-        self.assertAllClose([0., 0.], run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            0.,
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testTwoBatchScalarIdentityViaScale(self):
     with self.test_session() as sess:
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value).astype(np.float32)
         x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(fun(x, **kwargs), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
@@ -142,7 +148,8 @@ class AffineScalarBijectorTest(test.TestCase):
         self.assertAllClose([3., 0], run(bijector.forward, x))
         self.assertAllClose([0., 2], run(bijector.inverse, x))
         self.assertAllClose(
-            [-np.log(2), 0.], run(bijector.inverse_log_det_jacobian, x))
+            [-np.log(2), 0.],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testScalarCongruency(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
index 077e6176b4e7aecb28369d49edad6d1367cc7259..9e14b9a53e6c63876478d876030c476c5d77dbbb 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
@@ -40,14 +40,15 @@ class AffineBijectorTest(test.TestCase):
 
   def testNoBatchMultivariateIdentity(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
@@ -66,18 +67,20 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 1], [-1., -1]]
         self.assertAllClose([[2., 0], [0., -2]], run(bijector.forward, x))
         self.assertAllClose([[0., 2], [-2., 0]], run(bijector.inverse, x))
-        self.assertAllClose(0., run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            0., run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateDiag(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [1., -1]
@@ -89,9 +92,12 @@ class AffineBijectorTest(test.TestCase):
         # = [-1, -1] + [1, -1]
         self.assertAllClose([3., 0], run(bijector.forward, x))
         self.assertAllClose([0., 2], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
+        # Reset bijector.
+        bijector = Affine(shift=mu, scale_diag=[2., 1])
         # x is a 2-batch of 2-vectors.
         # The first vector is [1, 1], the second is [-1, -1].
         # Each undergoes matmul(sigma, x) + shift.
@@ -103,8 +109,9 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([[0., 2],
                              [-1., 0]],
                             run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateFullDynamic(self):
     with self.test_session() as sess:
@@ -126,18 +133,20 @@ class AffineBijectorTest(test.TestCase):
       self.assertAllClose([[0., 1]], sess.run(bijector.inverse(x), feed_dict))
       self.assertAllClose(
           -np.log(4),
-          sess.run(bijector.inverse_log_det_jacobian(x), feed_dict))
+          sess.run(bijector.inverse_log_det_jacobian(x, event_ndims=1),
+                   feed_dict))
 
   def testBatchMultivariateIdentity(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value, dtype=np.float32)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value)
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [[1., -1]]
@@ -147,19 +156,21 @@ class AffineBijectorTest(test.TestCase):
         x = [[[1., 1]]]
         self.assertAllClose([[[3., 1]]], run(bijector.forward, x))
         self.assertAllClose([[[0., 1]]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(4),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(4),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testBatchMultivariateDiag(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
-        x_value = np.array(x_value, dtype=np.float32)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+      def dynamic_run(fun, x_value, **kwargs):
+        x_value = np.array(x_value)
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = [[1., -1]]
@@ -169,8 +180,9 @@ class AffineBijectorTest(test.TestCase):
         x = [[[1., 1]]]
         self.assertAllClose([[[3., 1]]], run(bijector.forward, x))
         self.assertAllClose([[[0., 1]]], run(bijector.inverse, x))
-        self.assertAllClose([-np.log(4)],
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            [-np.log(4)],
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testBatchMultivariateFullDynamic(self):
     with self.test_session() as sess:
@@ -191,20 +203,22 @@ class AffineBijectorTest(test.TestCase):
       bijector = Affine(shift=mu, scale_diag=scale_diag)
       self.assertAllClose([[[3., 1]]], sess.run(bijector.forward(x), feed_dict))
       self.assertAllClose([[[0., 1]]], sess.run(bijector.inverse(x), feed_dict))
-      self.assertAllClose([-np.log(4)],
-                          sess.run(
-                              bijector.inverse_log_det_jacobian(x), feed_dict))
+      self.assertAllClose(
+          [-np.log(4)],
+          sess.run(bijector.inverse_log_det_jacobian(
+              x, event_ndims=1), feed_dict))
 
   def testIdentityWithDiagUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -216,19 +230,21 @@ class AffineBijectorTest(test.TestCase):
         x = [1., 2, 3]  # Three scalar samples (no batches).
         self.assertAllClose([1., 3, 5], run(bijector.forward, x))
         self.assertAllClose([1., 1.5, 2.], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(2.**3),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(2.**3),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -240,19 +256,21 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[1., 5]], run(bijector.forward, x))
         self.assertAllClose([[1., 0.5]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(4.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(4.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testDiagWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -262,19 +280,21 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[1., 7]], run(bijector.forward, x))
         self.assertAllClose([[1., 1 / 3.]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(6.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(6.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityAndDiagWithTriL(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -287,19 +307,21 @@ class AffineBijectorTest(test.TestCase):
         x = [[1., 2]]  # One multivariate sample.
         self.assertAllClose([[2., 9]], run(bijector.forward, x))
         self.assertAllClose([[2 / 3., 5 / 12.]], run(bijector.inverse, x))
-        self.assertAllClose(-np.log(12.),
-                            run(bijector.inverse_log_det_jacobian, x))
+        self.assertAllClose(
+            -np.log(12.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -319,22 +341,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 1.5, 4 / 3.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(60.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(60.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testDiagWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -353,22 +377,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 1., 0.8], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(150.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(150.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testTriLWithVDVTUpdate(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -388,22 +414,24 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([0.2, 14 / 15., 4 / 25.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(150.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(150.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testTriLWithVDVTUpdateNoDiagonal(self):
     with self.test_session() as sess:
+      placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
-      def static_run(fun, x):
-        return fun(x).eval()
+      def static_run(fun, x, **kwargs):
+        return fun(x, **kwargs).eval()
 
-      def dynamic_run(fun, x_value):
+      def dynamic_run(fun, x_value, **kwargs):
         x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
-        return sess.run(fun(x), feed_dict={x: x_value})
+        return sess.run(
+            fun(placeholder, **kwargs), feed_dict={placeholder: x_value})
 
       for run in (static_run, dynamic_run):
         mu = -1.
@@ -423,11 +451,12 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose([1 / 3., 8 / 9., 4 / 30.], run(bijector.inverse, x))
         self.assertAllClose(
             run(bijector_ref.inverse, x), run(bijector.inverse, x))
-        self.assertAllClose(-np.log(90.),
-                            run(bijector.inverse_log_det_jacobian, x))
         self.assertAllClose(
-            run(bijector.inverse_log_det_jacobian, x),
-            run(bijector_ref.inverse_log_det_jacobian, x))
+            -np.log(90.),
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
+        self.assertAllClose(
+            run(bijector.inverse_log_det_jacobian, x, event_ndims=1),
+            run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateRaisesWhenSingular(self):
     with self.test_session():
@@ -530,6 +559,7 @@ class AffineBijectorTest(test.TestCase):
             backward = np.squeeze(backward, axis=-1)
           self.assertAllClose(backward, bijector.inverse(x).eval())
 
+          scale *= np.ones(shape=x.shape[:-1], dtype=scale.dtype)
           ildj = -np.log(np.abs(np.linalg.det(scale)))
           # TODO(jvdillon): We need to make it so the scale_identity_multiplier
           # case does not deviate in expected shape. Fixing this will get rid of
@@ -540,7 +570,8 @@ class AffineBijectorTest(test.TestCase):
             ildj = np.squeeze(ildj[0])
           elif ildj.ndim < scale.ndim - 2:
             ildj = np.reshape(ildj, scale.shape[0:-2])
-          self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(x).eval())
+          self.assertAllClose(
+              ildj, bijector.inverse_log_det_jacobian(x, event_ndims=1).eval())
 
   def testLegalInputs(self):
     self._testLegalInputs(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
index a215a4a2b1ffbea7951bdb9b4352ed567e0b1e41..c832fcaa686c92f83810e4f99ca3b23ae694b723 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
@@ -83,10 +83,11 @@ class BatchNormTest(test_util.VectorDistributionTestHelpers,
           moving_mean = array_ops.identity(batch_norm.batchnorm.moving_mean)
           moving_var = array_ops.identity(batch_norm.batchnorm.moving_variance)
           denorm_x = batch_norm.forward(array_ops.identity(norm_x))
-          fldj = batch_norm.forward_log_det_jacobian(x)
+          fldj = batch_norm.forward_log_det_jacobian(
+              x, event_ndims=len(event_dims))
           # Use identity to invalidate cache.
           ildj = batch_norm.inverse_log_det_jacobian(
-              array_ops.identity(denorm_x))
+              array_ops.identity(denorm_x), event_ndims=len(event_dims))
         variables.global_variables_initializer().run()
         # Update variables.
         norm_x_ = sess.run(norm_x)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
index a748acd667e58f9b527bab11d8bc4d086996e9f3..ca20442c3940664feab7526110229872a6cdc41f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
@@ -20,21 +20,33 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine
 from tensorflow.contrib.distributions.python.ops.bijectors.chain import Chain
 from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
+class ShapeChanging(bijector.Bijector):
+  """Only used for op_ndims manipulation."""
+
+  def __init__(self, forward_min_event_ndims=0, inverse_min_event_ndims=3):
+    super(ShapeChanging, self).__init__(
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
+        validate_args=False, name="shape_changer")
+
+
 class ChainBijectorTest(test.TestCase):
   """Tests the correctness of the Y = Chain(bij1, bij2, bij3) transformation."""
 
   def testBijector(self):
     with self.test_session():
-      chain = Chain((Exp(event_ndims=1), Softplus(event_ndims=1)))
+      chain = Chain((Exp(), Softplus()))
       self.assertEqual("chain_of_exp_of_softplus", chain.name)
       x = np.asarray([[[1., 2.],
                        [2., 3.]]])
@@ -42,9 +54,10 @@ class ChainBijectorTest(test.TestCase):
       self.assertAllClose(np.log(x - 1.), chain.inverse(x).eval())
       self.assertAllClose(
           -np.sum(np.log(x - 1.), axis=2),
-          chain.inverse_log_det_jacobian(x).eval())
+          chain.inverse_log_det_jacobian(x, event_ndims=1).eval())
       self.assertAllClose(
-          np.sum(x, axis=2), chain.forward_log_det_jacobian(x).eval())
+          np.sum(x, axis=2),
+          chain.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testBijectorIdentity(self):
     with self.test_session():
@@ -54,31 +67,126 @@ class ChainBijectorTest(test.TestCase):
                        [2., 3.]]])
       self.assertAllClose(x, chain.forward(x).eval())
       self.assertAllClose(x, chain.inverse(x).eval())
-      self.assertAllClose(0., chain.inverse_log_det_jacobian(x).eval())
-      self.assertAllClose(0., chain.forward_log_det_jacobian(x).eval())
+      self.assertAllClose(
+          0., chain.inverse_log_det_jacobian(x, event_ndims=1).eval())
+      self.assertAllClose(
+          0., chain.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = Chain((Exp(), Softplus()))
+      chain = Chain((Exp(), Softplus()))
       assert_scalar_congruency(
-          bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
+          chain, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
   def testShapeGetters(self):
     with self.test_session():
-      bijector = Chain([
+      chain = Chain([
           SoftmaxCentered(validate_args=True),
           SoftmaxCentered(validate_args=True),
       ])
       x = tensor_shape.TensorShape([1])
       y = tensor_shape.TensorShape([2 + 1])
-      self.assertAllEqual(y, bijector.forward_event_shape(x))
+      self.assertAllEqual(y, chain.forward_event_shape(x))
       self.assertAllEqual(
           y.as_list(),
-          bijector.forward_event_shape_tensor(x.as_list()).eval())
-      self.assertAllEqual(x, bijector.inverse_event_shape(y))
+          chain.forward_event_shape_tensor(x.as_list()).eval())
+      self.assertAllEqual(x, chain.inverse_event_shape(y))
       self.assertAllEqual(
           x.as_list(),
-          bijector.inverse_event_shape_tensor(y.as_list()).eval())
+          chain.inverse_event_shape_tensor(y.as_list()).eval())
+
+  def testMinEventNdimsChain(self):
+    chain = Chain([Exp(), Exp(), Exp()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Affine(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Exp(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Exp()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), Exp(), Softplus(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingAddDims(self):
+    chain = Chain([ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(3, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(), Affine()])
+    self.assertEqual(1, chain.forward_min_event_ndims)
+    self.assertEqual(4, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(3, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(), ShapeChanging()])
+    self.assertEqual(0, chain.forward_min_event_ndims)
+    self.assertEqual(6, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingRemoveDims(self):
+    chain = Chain([ShapeChanging(3, 0)])
+    self.assertEqual(3, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(3, 0), Affine()])
+    self.assertEqual(3, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+    chain = Chain([Affine(), ShapeChanging(3, 0)])
+    self.assertEqual(4, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+    chain = Chain([ShapeChanging(3, 0), ShapeChanging(3, 0)])
+    self.assertEqual(6, chain.forward_min_event_ndims)
+    self.assertEqual(0, chain.inverse_min_event_ndims)
+
+  def testMinEventNdimsShapeChangingAddRemoveDims(self):
+    chain = Chain([
+        ShapeChanging(2, 1),
+        ShapeChanging(3, 0),
+        ShapeChanging(1, 2)])
+    self.assertEqual(4, chain.forward_min_event_ndims)
+    self.assertEqual(1, chain.inverse_min_event_ndims)
+
+  def testChainExpAffine(self):
+    scale_diag = np.array([1., 2., 3.], dtype=np.float32)
+    chain = Chain([Exp(), Affine(scale_diag=scale_diag)])
+    x = [0., np.log(2., dtype=np.float32), np.log(3., dtype=np.float32)]
+    y = [1., 4., 27.]
+    self.assertAllClose(y, self.evaluate(chain.forward(x)))
+    self.assertAllClose(x, self.evaluate(chain.inverse(y)))
+    self.assertAllClose(
+        np.log(6, dtype=np.float32) + np.sum(scale_diag * x),
+        self.evaluate(chain.forward_log_det_jacobian(x, event_ndims=1)))
+
+    self.assertAllClose(
+        -np.log(6, dtype=np.float32) - np.sum(scale_diag * x),
+        self.evaluate(chain.inverse_log_det_jacobian(y, event_ndims=1)))
+
+  def testChainAffineExp(self):
+    scale_diag = np.array([1., 2., 3.], dtype=np.float32)
+    chain = Chain([Affine(scale_diag=scale_diag), Exp()])
+    x = [0., np.log(2., dtype=np.float32), np.log(3., dtype=np.float32)]
+    y = [1., 4., 9.]
+    self.assertAllClose(y, self.evaluate(chain.forward(x)))
+    self.assertAllClose(x, self.evaluate(chain.inverse(y)))
+    self.assertAllClose(
+        np.log(6, dtype=np.float32) + np.sum(x),
+        self.evaluate(chain.forward_log_det_jacobian(x, event_ndims=1)))
+
+    self.assertAllClose(
+        -np.log(6, dtype=np.float32) - np.sum(x),
+        self.evaluate(chain.inverse_log_det_jacobian(y, event_ndims=1)))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
index f392e83d2c3da9dac43c2e87070e952ae2060b34..e281e81bdf0698c1f7b2f60fb27783dd1351773f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
@@ -51,10 +51,13 @@ class CholeskyOuterProductBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          ildj, bijector.inverse_log_det_jacobian(y).eval(), atol=0., rtol=1e-7)
+          ildj, bijector.inverse_log_det_jacobian(
+              y, event_ndims=2).eval(), atol=0., rtol=1e-7)
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(
+              y, event_ndims=2).eval(),
+          bijector.forward_log_det_jacobian(
+              x, event_ndims=2).eval(),
           atol=0.,
           rtol=1e-7)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
index 26e0d2a539c78540603281ae0f361987a7bf8d90..8b279ebcd908b6f375b35594ac5f3db9228a1e31 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
@@ -27,7 +27,7 @@ class _TestBijector(ConditionalBijector):
 
   def __init__(self):
     super(_TestBijector, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=0,
         graph_parents=[],
         is_constant_jacobian=True,
         validate_args=False,
@@ -51,11 +51,15 @@ class ConditionalBijectorTest(test.TestCase):
 
   def testConditionalBijector(self):
     b = _TestBijector()
-    for name in ["forward", "inverse", "inverse_log_det_jacobian",
-                 "forward_log_det_jacobian"]:
+    for name in ["forward", "inverse"]:
       method = getattr(b, name)
       with self.assertRaisesRegexp(ValueError, name + ".*b1.*b2"):
-        method(1.0, arg1="b1", arg2="b2")
+        method(1., arg1="b1", arg2="b2")
+
+    for name in ["inverse_log_det_jacobian", "forward_log_det_jacobian"]:
+      method = getattr(b, name)
+      with self.assertRaisesRegexp(ValueError, name + ".*b1.*b2"):
+        method(1., event_ndims=0., arg1="b1", arg2="b2")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
index 9970c0b4d86afda188d9401ebaf3c98d3fffbfdf..7be939cd274e6f0e33c9b01c82494755db2caa73 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
@@ -31,17 +31,21 @@ class ExpBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      bijector = Exp(event_ndims=1)
+      bijector = Exp()
       self.assertEqual("exp", bijector.name)
       x = [[[1.], [2.]]]
       y = np.exp(x)
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          -np.sum(np.log(y), axis=-1),
-          bijector.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-bijector.inverse_log_det_jacobian(np.exp(x)).eval(),
-                          bijector.forward_log_det_jacobian(x).eval())
+          -np.squeeze(np.log(y), axis=-1),
+          bijector.inverse_log_det_jacobian(
+              y, event_ndims=1).eval())
+      self.assertAllClose(
+          -bijector.inverse_log_det_jacobian(
+              np.exp(x), event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(
+              x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
@@ -51,10 +55,10 @@ class ExpBijectorTest(test.TestCase):
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = Exp(event_ndims=0)
+      bijector = Exp()
       x = np.linspace(-10, 10, num=10).astype(np.float32)
       y = np.logspace(-10, 10, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
index 9a905980c7581a86bbcda8c6c726da57c09fe4f8..54e54c3296a89a4fe29a3cce971760502b65e784 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
@@ -34,7 +34,7 @@ class GumbelBijectorTest(test.TestCase):
     with self.test_session():
       loc = 0.3
       scale = 5.
-      bijector = Gumbel(loc=loc, scale=scale, event_ndims=1, validate_args=True)
+      bijector = Gumbel(loc=loc, scale=scale, validate_args=True)
       self.assertEqual("gumbel", bijector.name)
       x = np.array([[[-3.], [0.], [0.5], [4.2], [12.]]], dtype=np.float32)
       # Gumbel distribution
@@ -43,13 +43,11 @@ class GumbelBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          # We should lose a dimension from calculating the determinant of the
-          # jacobian.
-          np.squeeze(gumbel_dist.logpdf(x), axis=2),
-          bijector.forward_log_det_jacobian(x).eval())
+          np.squeeze(gumbel_dist.logpdf(x), axis=-1),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -60,10 +58,10 @@ class GumbelBijectorTest(test.TestCase):
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = Gumbel(loc=0., scale=3.0, event_ndims=0, validate_args=True)
+      bijector = Gumbel(loc=0., scale=3.0, validate_args=True)
       x = np.linspace(-10., 10., num=10).astype(np.float32)
       y = np.linspace(0.01, 0.99, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
index 739fa6d439a8bce993ab1b4601489d9bbcd69bee..7d3bd758cd2db307f95d2d934923ea2133dc1217 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
@@ -33,15 +33,13 @@ class InlineBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      exp = Exp(event_ndims=1)
+      exp = Exp()
       inline = Inline(
           forward_fn=math_ops.exp,
           inverse_fn=math_ops.log,
-          inverse_log_det_jacobian_fn=(
-              lambda y: -math_ops.reduce_sum(  # pylint: disable=g-long-lambda
-                  math_ops.log(y), reduction_indices=-1)),
-          forward_log_det_jacobian_fn=(
-              lambda x: math_ops.reduce_sum(x, reduction_indices=-1)),
+          inverse_log_det_jacobian_fn=lambda y: -math_ops.log(y),
+          forward_log_det_jacobian_fn=lambda x: x,
+          forward_min_event_ndims=0,
           name="exp")
 
       self.assertEqual(exp.name, inline.name)
@@ -51,9 +49,10 @@ class InlineBijectorTest(test.TestCase):
       self.assertAllClose(x, inline.inverse(y).eval())
       self.assertAllClose(
           -np.sum(np.log(y), axis=-1),
-          inline.inverse_log_det_jacobian(y).eval())
-      self.assertAllClose(-inline.inverse_log_det_jacobian(y).eval(),
-                          inline.forward_log_det_jacobian(x).eval())
+          inline.inverse_log_det_jacobian(y, event_ndims=1).eval())
+      self.assertAllClose(
+          -inline.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          inline.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testShapeGetters(self):
     with self.test_session():
@@ -62,6 +61,7 @@ class InlineBijectorTest(test.TestCase):
           forward_event_shape_fn=lambda x: x.as_list() + [1],
           inverse_event_shape_tensor_fn=lambda x: x[:-1],
           inverse_event_shape_fn=lambda x: x[:-1],
+          forward_min_event_ndims=0,
           name="shape_only")
       x = tensor_shape.TensorShape([1, 2, 3])
       y = tensor_shape.TensorShape([1, 2, 3, 1])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
index 58ba9cedb1437df4e000ce32fe39664afa76c3b5..8b14c8327f08902044f50483f9f8dfe67b58cd70 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
@@ -34,9 +34,9 @@ class InvertBijectorTest(test.TestCase):
     with self.test_session():
       for fwd in [
           bijectors.Identity(),
-          bijectors.Exp(event_ndims=1),
+          bijectors.Exp(),
           bijectors.Affine(shift=[0., 1.], scale_diag=[2., 3.]),
-          bijectors.Softplus(event_ndims=1),
+          bijectors.Softplus(),
           bijectors.SoftmaxCentered(),
       ]:
         rev = bijectors.Invert(fwd)
@@ -46,11 +46,11 @@ class InvertBijectorTest(test.TestCase):
         self.assertAllClose(fwd.inverse(x).eval(), rev.forward(x).eval())
         self.assertAllClose(fwd.forward(x).eval(), rev.inverse(x).eval())
         self.assertAllClose(
-            fwd.forward_log_det_jacobian(x).eval(),
-            rev.inverse_log_det_jacobian(x).eval())
+            fwd.forward_log_det_jacobian(x, event_ndims=1).eval(),
+            rev.inverse_log_det_jacobian(x, event_ndims=1).eval())
         self.assertAllClose(
-            fwd.inverse_log_det_jacobian(x).eval(),
-            rev.forward_log_det_jacobian(x).eval())
+            fwd.inverse_log_det_jacobian(x, event_ndims=1).eval(),
+            rev.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
index 074b5f275d107fa49de42df262476bd4aa48ffae..a8089881f684db9f8876d6dd738e52bf2f1f7606 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
@@ -34,8 +34,7 @@ class KumaraswamyBijectorTest(test.TestCase):
       a = 2.
       b = 0.3
       bijector = Kumaraswamy(
-          concentration1=a, concentration0=b,
-          event_ndims=0, validate_args=True)
+          concentration1=a, concentration0=b, validate_args=True)
       self.assertEqual("kumaraswamy", bijector.name)
       x = np.array([[[0.1], [0.2], [0.3], [0.4], [0.5]]], dtype=np.float32)
       # Kumaraswamy cdf. This is the same as inverse(x).
@@ -46,13 +45,11 @@ class KumaraswamyBijectorTest(test.TestCase):
                              (b - 1) * np.log1p(-x ** a))
 
       self.assertAllClose(
-          # We should lose a dimension from calculating the determinant of the
-          # jacobian.
-          kumaraswamy_log_pdf,
-          bijector.inverse_log_det_jacobian(x).eval())
+          np.squeeze(kumaraswamy_log_pdf, axis=-1),
+          bijector.inverse_log_det_jacobian(x, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(x).eval(),
-          bijector.forward_log_det_jacobian(y).eval(),
+          -bijector.inverse_log_det_jacobian(x, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(y, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -73,7 +70,7 @@ class KumaraswamyBijectorTest(test.TestCase):
       # endpoints.
       y = np.linspace(.01, 0.99, num=10).astype(np.float32)
       x = 1 - (1 - y ** concentration1) ** concentration0
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
index dcfb0eb05185d36d96947905c2eb91b2201aece1..5ba5a2083bf11791d7d58146dc2e6283b524d241 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
@@ -79,9 +79,10 @@ class MaskedAutoregressiveFlowTest(test_util.VectorDistributionTestHelpers,
       forward_x = ma.forward(x)
       # Use identity to invalidate cache.
       inverse_y = ma.inverse(array_ops.identity(forward_x))
-      fldj = ma.forward_log_det_jacobian(x)
+      fldj = ma.forward_log_det_jacobian(x, event_ndims=1)
       # Use identity to invalidate cache.
-      ildj = ma.inverse_log_det_jacobian(array_ops.identity(forward_x))
+      ildj = ma.inverse_log_det_jacobian(
+          array_ops.identity(forward_x), event_ndims=1)
       variables.global_variables_initializer().run()
       [
           forward_x_,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
index 54590de373441c32cc3214cb04d45cfc2d1807ed..7eef4ab599951bbb624652f13a0091363b36b93d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
@@ -53,8 +53,8 @@ class PermuteBijectorTest(test.TestCase):
           bijector.permutation,
           bijector.inverse(expected_y),
           bijector.forward(expected_x),
-          bijector.forward_log_det_jacobian(expected_x),
-          bijector.inverse_log_det_jacobian(expected_y),
+          bijector.forward_log_det_jacobian(expected_x, event_ndims=1),
+          bijector.inverse_log_det_jacobian(expected_y, event_ndims=1),
       ], feed_dict={permutation_ph: expected_permutation})
       self.assertEqual("permute", bijector.name)
       self.assertAllEqual(expected_permutation, permutation_)
@@ -78,10 +78,9 @@ class PermuteBijectorTest(test.TestCase):
     x = np.random.randn(4, 2, 3)
     y = x[..., permutation]
     with self.test_session():
-      bijector = Permute(
-          permutation=permutation,
-          validate_args=True)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
+      bijector = Permute(permutation=permutation, validate_args=True)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=1, rtol=1e-6, atol=0)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
index de1659aa9f4d0f7d19ec2e8185715573b78eaf2b..85d22830132816cd6c77cd0b07870f3a22ae9798 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
@@ -32,8 +32,7 @@ class PowerTransformBijectorTest(test.TestCase):
   def testBijector(self):
     with self.test_session():
       c = 0.2
-      bijector = PowerTransform(
-          power=c, event_ndims=1, validate_args=True)
+      bijector = PowerTransform(power=c, validate_args=True)
       self.assertEqual("power_transform", bijector.name)
       x = np.array([[[-1.], [2.], [-5. + 1e-4]]])
       y = (1. + x * c)**(1. / c)
@@ -41,27 +40,25 @@ class PowerTransformBijectorTest(test.TestCase):
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
           (c - 1.) * np.sum(np.log(y), axis=-1),
-          bijector.inverse_log_det_jacobian(y).eval())
+          bijector.inverse_log_det_jacobian(y, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = PowerTransform(
-          power=0.2, validate_args=True)
+      bijector = PowerTransform(power=0.2, validate_args=True)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=1.5, rtol=0.05)
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = PowerTransform(
-          power=0.2, event_ndims=0, validate_args=True)
+      bijector = PowerTransform(power=0.2, validate_args=True)
       x = np.linspace(-4.999, 10, num=10).astype(np.float32)
       y = np.logspace(0.001, 10, num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
index 46fe7797419a9906ecdad60dd0dfe1e9d7c743ed..2d52895fbe0967cdd2260d6d298a291286858d09 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
@@ -52,24 +52,28 @@ class RealNVPTest(test_util.VectorDistributionTestHelpers, test.TestCase):
       forward_x = nvp.forward(x)
       # Use identity to invalidate cache.
       inverse_y = nvp.inverse(array_ops.identity(forward_x))
-      fldj = nvp.forward_log_det_jacobian(x)
+      forward_inverse_y = nvp.forward(inverse_y)
+      fldj = nvp.forward_log_det_jacobian(x, event_ndims=1)
       # Use identity to invalidate cache.
-      ildj = nvp.inverse_log_det_jacobian(array_ops.identity(forward_x))
+      ildj = nvp.inverse_log_det_jacobian(
+          array_ops.identity(forward_x), event_ndims=1)
       variables.global_variables_initializer().run()
       [
           forward_x_,
           inverse_y_,
+          forward_inverse_y_,
           ildj_,
           fldj_,
       ] = sess.run([
           forward_x,
           inverse_y,
+          forward_inverse_y,
           ildj,
           fldj,
       ])
       self.assertEqual("real_nvp", nvp.name)
-      self.assertAllClose(forward_x_, forward_x_, rtol=1e-6, atol=0.)
-      self.assertAllClose(x_, inverse_y_, rtol=1e-5, atol=0.)
+      self.assertAllClose(forward_x_, forward_inverse_y_, rtol=1e-1, atol=0.)
+      self.assertAllClose(x_, inverse_y_, rtol=1e-1, atol=0.)
       self.assertAllClose(ildj_, -fldj_, rtol=1e-6, atol=0.)
 
   def testMutuallyConsistent(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index e216d88cb190dc16fc0056186f80817d6f2d7c67..46f2c63f9b0f78b25bb1948e6ea55ab20c5cfa6e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -65,8 +65,8 @@ class _ReshapeBijectorTest(object):
        ildj_) = sess.run((
            bijector.inverse(expected_y),
            bijector.forward(expected_x),
-           bijector.forward_log_det_jacobian(expected_x),
-           bijector.inverse_log_det_jacobian(expected_y),
+           bijector.forward_log_det_jacobian(expected_x, event_ndims=2),
+           bijector.inverse_log_det_jacobian(expected_y, event_ndims=2),
        ), feed_dict=feed_dict)
       self.assertEqual("reshape", bijector.name)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
@@ -301,7 +301,8 @@ class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
           event_shape_in=[2, 3],
           event_shape_out=[1, 2, 3],
           validate_args=True)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=2, rtol=1e-6, atol=0)
 
   def testInvalidDimensionsOpError(self):
     if ops._USE_C_API:
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
index e4f9d72785c301284812a48c0a67614ca439ffae..cea4a62c22af5d98d38ee881b29c773e6a27a4b4 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
@@ -36,12 +36,13 @@ class SigmoidBijectorTest(test.TestCase):
       x = np.linspace(-10., 10., 100).reshape([2, 5, 10]).astype(np.float32)
       y = special.expit(x)
       ildj = -np.log(y) - np.log1p(-y)
-      self.assertAllClose(y, Sigmoid().forward(x).eval(), atol=0., rtol=1e-2)
-      self.assertAllClose(x, Sigmoid().inverse(y).eval(), atol=0., rtol=1e-4)
-      self.assertAllClose(ildj, Sigmoid().inverse_log_det_jacobian(y).eval(),
-                          atol=0., rtol=1e-6)
-      self.assertAllClose(-ildj, Sigmoid().forward_log_det_jacobian(x).eval(),
-                          atol=0., rtol=1e-4)
+      bijector = Sigmoid()
+      self.assertAllClose(y, bijector.forward(x).eval(), atol=0., rtol=1e-2)
+      self.assertAllClose(x, bijector.inverse(y).eval(), atol=0., rtol=1e-4)
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=0).eval(), atol=0., rtol=1e-6)
+      self.assertAllClose(-ildj, bijector.forward_log_det_jacobian(
+          x, event_ndims=0).eval(), atol=0., rtol=1e-4)
 
   def testScalarCongruency(self):
     with self.test_session():
@@ -52,7 +53,8 @@ class SigmoidBijectorTest(test.TestCase):
       x = np.linspace(-7., 7., 100).astype(np.float32)
       eps = 1e-3
       y = np.linspace(eps, 1. - eps, 100).astype(np.float32)
-      assert_bijective_and_finite(Sigmoid(), x, y, atol=0., rtol=1e-4)
+      assert_bijective_and_finite(
+          Sigmoid(), x, y, event_ndims=0, atol=0., rtol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
index 172c180a44229089f06f250a872bc47a89991cf0..45760a29ee42835da69ef63803ccec7ce82a5a8f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
@@ -39,7 +39,6 @@ class SinhArcsinhBijectorTest(test.TestCase):
       bijector = SinhArcsinh(
           skewness=skewness,
           tailweight=tailweight,
-          event_ndims=1,
           validate_args=True)
       self.assertEqual("SinhArcsinh", bijector.name)
       x = np.array([[[-2.01], [2.], [1e-4]]]).astype(np.float32)
@@ -50,10 +49,11 @@ class SinhArcsinhBijectorTest(test.TestCase):
           np.sum(
               np.log(np.cosh(np.arcsinh(y) / tailweight - skewness)) -
               np.log(tailweight) - np.log(np.sqrt(y**2 + 1)),
-              axis=-1), bijector.inverse_log_det_jacobian(y).eval())
+              axis=-1),
+          bijector.inverse_log_det_jacobian(y, event_ndims=1).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=1).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -106,14 +106,15 @@ class SinhArcsinhBijectorTest(test.TestCase):
       bijector = SinhArcsinh(skewness=-1., tailweight=0.5, validate_args=True)
       x = np.concatenate((-np.logspace(-2, 10, 1000), [0], np.logspace(
           -2, 10, 1000))).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, x, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, x, event_ndims=0, rtol=1e-3)
 
   def testBijectiveAndFiniteSkewness1Tailweight3(self):
     with self.test_session():
       bijector = SinhArcsinh(skewness=1., tailweight=3., validate_args=True)
       x = np.concatenate((-np.logspace(-2, 5, 1000), [0], np.logspace(
           -2, 5, 1000))).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, x, rtol=1e-3)
+      assert_bijective_and_finite(
+          bijector, x, x, event_ndims=0, rtol=1e-3)
 
   def testBijectorEndpoints(self):
     with self.test_session():
@@ -124,7 +125,8 @@ class SinhArcsinhBijectorTest(test.TestCase):
             [np.finfo(dtype).min, np.finfo(dtype).max], dtype=dtype)
         # Note that the above bijector is the identity bijector. Hence, the
         # log_det_jacobian will be 0. Because of this we use atol.
-        assert_bijective_and_finite(bijector, bounds, bounds, atol=2e-6)
+        assert_bijective_and_finite(
+            bijector, bounds, bounds, event_ndims=0, atol=2e-6)
 
   def testBijectorOverRange(self):
     with self.test_session():
@@ -156,12 +158,12 @@ class SinhArcsinhBijectorTest(test.TestCase):
                 np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt(
                     y_float128**2 + 1)) -
             np.log(tailweight),
-            bijector.inverse_log_det_jacobian(y).eval(),
+            bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
             rtol=1e-4,
             atol=0.)
         self.assertAllClose(
-            -bijector.inverse_log_det_jacobian(y).eval(),
-            bijector.forward_log_det_jacobian(x).eval(),
+            -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+            bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
             rtol=1e-4,
             atol=0.)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
index cad4dd1ac8de0da6405aacb9047714b37eec73e3..0f0a2fa531a0585a709df4c2c3e2631e5c275986 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
@@ -44,12 +44,12 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
       self.assertAllClose(x, softmax.inverse(y).eval())
       self.assertAllClose(
           -np.sum(np.log(y), axis=1),
-          softmax.inverse_log_det_jacobian(y).eval(),
+          softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(),
           atol=0.,
           rtol=1e-7)
       self.assertAllClose(
-          -softmax.inverse_log_det_jacobian(y).eval(),
-          softmax.forward_log_det_jacobian(x).eval(),
+          -softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          softmax.forward_log_det_jacobian(x, event_ndims=1).eval(),
           atol=0.,
           rtol=1e-7)
 
@@ -67,14 +67,14 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
           feed_dict={y: real_y}))
       self.assertAllClose(
           -np.sum(np.log(real_y), axis=1),
-          softmax.inverse_log_det_jacobian(y).eval(
+          softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(
               feed_dict={y: real_y}),
           atol=0.,
           rtol=1e-7)
       self.assertAllClose(
-          -softmax.inverse_log_det_jacobian(y).eval(
+          -softmax.inverse_log_det_jacobian(y, event_ndims=1).eval(
               feed_dict={y: real_y}),
-          softmax.forward_log_det_jacobian(x).eval(
+          softmax.forward_log_det_jacobian(x, event_ndims=1).eval(
               feed_dict={x: real_x}),
           atol=0.,
           rtol=1e-7)
@@ -104,7 +104,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
       y = np.array([y_0, y_1, y_2])
       y /= y.sum(axis=0)
       y = y.T  # y.shape = [5, 3]
-      assert_bijective_and_finite(softmax, x, y)
+      assert_bijective_and_finite(softmax, x, y, event_ndims=1)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
index d9af9aec50d3d69bb10f69f2ffd6ca3a24c316f8..3d8a0a32bba3539f732140e8eb7ebeb532d73ff5 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
@@ -43,13 +43,13 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testHingeSoftnessZeroRaises(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=0., validate_args=True)
+      bijector = Softplus(hinge_softness=0., validate_args=True)
       with self.assertRaisesOpError("must be non-zero"):
         bijector.forward([1., 1.]).eval()
 
   def testBijectorForwardInverseEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
       y = self._softplus(x)
@@ -59,7 +59,7 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorForwardInverseWithHingeSoftnessEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.5)
+      bijector = Softplus(hinge_softness=1.5)
       x = 2 * rng.randn(2, 10)
       y = 1.5 * self._softplus(x / 1.5)
 
@@ -68,16 +68,17 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorLogDetJacobianEventDimsZero(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       y = 2 * rng.rand(2, 10)
       # No reduction needed if event_dims = 0.
       ildj = self._softplus_ildj_before_reduction(y)
 
-      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(y).eval())
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=0).eval())
 
   def testBijectorForwardInverseEventDimsOne(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=1)
+      bijector = Softplus()
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
       y = self._softplus(x)
@@ -87,58 +88,59 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorLogDetJacobianEventDimsOne(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=1)
+      bijector = Softplus()
       y = 2 * rng.rand(2, 10)
       ildj_before = self._softplus_ildj_before_reduction(y)
       ildj = np.sum(ildj_before, axis=1)
 
-      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(y).eval())
+      self.assertAllClose(ildj, bijector.inverse_log_det_jacobian(
+          y, event_ndims=1).eval())
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testScalarCongruencyWithPositiveHingeSoftness(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.3)
+      bijector = Softplus(hinge_softness=1.3)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testScalarCongruencyWithNegativeHingeSoftness(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=-1.3)
+      bijector = Softplus(hinge_softness=-1.3)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testBijectiveAndFinite32bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFiniteWithPositiveHingeSoftness32Bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=1.23)
+      bijector = Softplus(hinge_softness=1.23)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFiniteWithNegativeHingeSoftness32Bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0, hinge_softness=-0.7)
+      bijector = Softplus(hinge_softness=-0.7)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = -np.logspace(-10, 10, 100).astype(np.float32)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-2, atol=1e-2)
+          bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFinite16bit(self):
     with self.test_session():
-      bijector = Softplus(event_ndims=0)
+      bijector = Softplus()
       # softplus(-20) is zero, so we can't use such a large range as in 32bit.
       x = np.linspace(-10., 20., 100).astype(np.float16)
       # Note that float16 is only in the open set (0, inf) for a smaller
@@ -146,7 +148,7 @@ class SoftplusBijectorTest(test.TestCase):
       # for the test.
       y = np.logspace(-6, 3, 100).astype(np.float16)
       assert_bijective_and_finite(
-          bijector, x, y, rtol=1e-1, atol=1e-3)
+          bijector, x, y, event_ndims=0, rtol=1e-1, atol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ac06fce55b448a5f3da7ccb7f8766b5b1404ad7
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
@@ -0,0 +1,111 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops.bijectors.softsign import Softsign
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
+from tensorflow.python.platform import test
+
+
+class SoftsignBijectorTest(test.TestCase):
+  """Tests the correctness of the Y = g(X) = X / (1 + |X|) transformation."""
+
+  def _softsign(self, x):
+    return x / (1. + np.abs(x))
+
+  def _softsign_ildj_before_reduction(self, y):
+    """Inverse log det jacobian, before being reduced."""
+    return -2. * np.log1p(-np.abs(y))
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorBounds(self):
+    bijector = Softsign(validate_args=True)
+    with self.test_session():
+      with self.assertRaisesOpError("greater than -1"):
+        bijector.inverse(-3.).eval()
+      with self.assertRaisesOpError("greater than -1"):
+        bijector.inverse_log_det_jacobian(-3., event_ndims=0).eval()
+
+      with self.assertRaisesOpError("less than 1"):
+        bijector.inverse(3.).eval()
+      with self.assertRaisesOpError("less than 1"):
+        bijector.inverse_log_det_jacobian(3., event_ndims=0).eval()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorForwardInverse(self):
+    bijector = Softsign(validate_args=True)
+    self.assertEqual("softsign", bijector.name)
+    x = 2. * self._rng.randn(2, 10)
+    y = self._softsign(x)
+
+    self.assertAllClose(y, self.evaluate(bijector.forward(x)))
+    self.assertAllClose(x, self.evaluate(bijector.inverse(y)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorLogDetJacobianEventDimsZero(self):
+    bijector = Softsign(validate_args=True)
+    y = self._rng.rand(2, 10)
+    # No reduction needed if event_dims = 0.
+    ildj = self._softsign_ildj_before_reduction(y)
+
+    self.assertAllClose(ildj, self.evaluate(
+        bijector.inverse_log_det_jacobian(y, event_ndims=0)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorForwardInverseEventDimsOne(self):
+    bijector = Softsign(validate_args=True)
+    self.assertEqual("softsign", bijector.name)
+    x = 2. * self._rng.randn(2, 10)
+    y = self._softsign(x)
+    self.assertAllClose(y, self.evaluate(bijector.forward(x)))
+    self.assertAllClose(x, self.evaluate(bijector.inverse(y)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijectorLogDetJacobianEventDimsOne(self):
+    bijector = Softsign(validate_args=True)
+    y = self._rng.rand(2, 10)
+    ildj_before = self._softsign_ildj_before_reduction(y)
+    ildj = np.sum(ildj_before, axis=1)
+    self.assertAllClose(
+        ildj, self.evaluate(
+            bijector.inverse_log_det_jacobian(y, event_ndims=1)))
+
+  def testScalarCongruency(self):
+    with self.test_session():
+      bijector = Softsign(validate_args=True)
+      assert_scalar_congruency(bijector, lower_x=-20., upper_x=20.)
+
+  def testBijectiveAndFinite(self):
+    with self.test_session():
+      bijector = Softsign(validate_args=True)
+      x = np.linspace(-20., 20., 100).astype(np.float32)
+      y = np.linspace(-0.99, 0.99, 100).astype(np.float32)
+      assert_bijective_and_finite(
+          bijector, x, y, event_ndims=0, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
index f03d6f1343a11ae4517f9034ceb0c99ca6fe7fa2..30c7a738c320b609ce90685512e6b8344dffc9dc 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
@@ -41,10 +41,11 @@ class SquareBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          ildj, bijector.inverse_log_det_jacobian(y).eval(), atol=0., rtol=1e-7)
+          ildj, bijector.inverse_log_det_jacobian(
+              y, event_ndims=0).eval(), atol=0., rtol=1e-7)
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
           atol=0.,
           rtol=1e-7)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
index 7a31228d1ade55ce32b511dca073657d3bab53ae..f57adcda898a1fdb18aacbb0804411db1bb4e4c8 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
@@ -36,7 +36,7 @@ class WeibullBijectorTest(test.TestCase):
       concentration = 0.3
       bijector = Weibull(
           scale=scale, concentration=concentration,
-          event_ndims=1, validate_args=True)
+          validate_args=True)
       self.assertEqual("weibull", bijector.name)
       x = np.array([[[0.], [1.], [14.], [20.], [100.]]], dtype=np.float32)
       # Weibull distribution
@@ -45,13 +45,11 @@ class WeibullBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
       self.assertAllClose(
-          # We should lose a dimension from calculating the determinant of the
-          # jacobian.
-          np.squeeze(weibull_dist.logpdf(x), axis=2),
-          bijector.forward_log_det_jacobian(x).eval())
+          weibull_dist.logpdf(x),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval())
       self.assertAllClose(
-          -bijector.inverse_log_det_jacobian(y).eval(),
-          bijector.forward_log_det_jacobian(x).eval(),
+          -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+          bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
           rtol=1e-4,
           atol=0.)
 
@@ -64,12 +62,12 @@ class WeibullBijectorTest(test.TestCase):
   def testBijectiveAndFinite(self):
     with self.test_session():
       bijector = Weibull(
-          scale=20., concentration=2., event_ndims=0, validate_args=True)
+          scale=20., concentration=2., validate_args=True)
       x = np.linspace(1., 8., num=10).astype(np.float32)
       y = np.linspace(
           -np.expm1(-1 / 400.),
           -np.expm1(-16), num=10).astype(np.float32)
-      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, event_ndims=0, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
index 545471907f1eabc822b3d28ea9c57e183a09ff50..4e8989b6c2f93560b1fccbc99491d7809f494263 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
@@ -44,6 +44,7 @@ class _ChooseLocation(ConditionalBijector):
           graph_parents=[self._loc],
           is_constant_jacobian=True,
           validate_args=False,
+          forward_min_event_ndims=0,
           name=name)
 
   def _forward(self, x, z):
@@ -52,7 +53,7 @@ class _ChooseLocation(ConditionalBijector):
   def _inverse(self, x, z):
     return x - self._gather_loc(z)
 
-  def _inverse_log_det_jacobian(self, x, z=None):
+  def _inverse_log_det_jacobian(self, x, event_ndims, z=None):
     return 0.
 
   def _gather_loc(self, z):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
index 933756aa8e12cca4c42eb98d9193512bbf2ad585..9635134b08db47a47a17c869fe813e0376ae6f1e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
@@ -68,7 +68,7 @@ class MultivariateNormalDiagTest(test.TestCase):
       dist = ds.TransformedDistribution(
           base_dist,
           validate_args=True,
-          bijector=bijectors.Softplus(event_ndims=1))
+          bijector=bijectors.Softplus())
       samps = dist.sample(5)  # Shape [5, 1, 3].
       self.assertAllEqual([5, 1], dist.log_prob(samps).get_shape())
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
index 0400c80c29cf0c36090168b7a1a6358ad49fde49..ce6cf702d522792f1ad26066a3d9be42003a0e3c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import statistical_testing as st
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
@@ -129,13 +129,13 @@ class StatisticalTestingTest(test.TestCase):
 
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is not 0.4.
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("Mean confidence interval too high"):
         sess.run(st.assert_true_mean_equal_by_dkwm(
             samples, 0., 1., 0.4, false_fail_rate=1e-6))
 
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is not 0.6.
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("Mean confidence interval too low"):
         sess.run(st.assert_true_mean_equal_by_dkwm(
             samples, 0., 1., 0.6, false_fail_rate=1e-6))
 
@@ -172,7 +172,7 @@ class StatisticalTestingTest(test.TestCase):
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is different from the mean of beta(2, 1).
       beta_high_samples = rng.beta(2, 1, size=num_samples).astype(np.float32)
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("samples1 has a smaller mean"):
         sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
             samples1, 0., 1.,
             beta_high_samples, 0., 1.,
@@ -190,7 +190,7 @@ class StatisticalTestingTest(test.TestCase):
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is different from the mean of beta(1, 2).
       beta_low_samples = rng.beta(1, 2, size=num_samples).astype(np.float32)
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("samples2 has a smaller mean"):
         sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
             samples1, 0., 1.,
             beta_low_samples, 0., 1.,
@@ -198,23 +198,46 @@ class StatisticalTestingTest(test.TestCase):
 
   def test_dkwm_argument_validity_checking(self):
     rng = np.random.RandomState(seed=0)
-    samples = rng.uniform(size=5000).astype(np.float32)
+    samples = rng.uniform(
+        low=[0., 1.], high=[1., 2.], size=(2500, 1, 2)).astype(np.float32)
 
     # Test that the test library complains if the given samples fall
     # outside the purported bounds.
     with self.test_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("maximum value exceeds expectations"):
         sess.run(st.true_mean_confidence_interval_by_dkwm(
-            samples, 0., 0.5, error_rate=0.5))
-      with self.assertRaises(errors.InvalidArgumentError):
+            samples, [[0., 1.]], [[0.5, 1.5]], error_rate=0.5))
+      with self.assertRaisesOpError("minimum value falls below expectations"):
         sess.run(st.true_mean_confidence_interval_by_dkwm(
-            samples, 0.5, 1., error_rate=0.5))
+            samples, [[0.5, 1.5]], [[1., 2.]], error_rate=0.5))
 
       # But doesn't complain if they don't.
       op = st.true_mean_confidence_interval_by_dkwm(
-          samples, 0., 1., error_rate=0.5)
+          samples, [[0., 1.]], [[1., 2.]], error_rate=0.5)
       _ = sess.run(op)
 
+  def test_do_maximum_mean(self):
+    n = 117
+    envelope = 0.02  # > 2 / n, but < 3 / n
+    rng = np.random.RandomState(seed=8)
+    samples = rng.uniform(size=n).astype(np.float32)
+
+    # Compute the answer in TF using the code under test
+    with self.test_session() as sess:
+      envelope_t = ops.convert_to_tensor(envelope)
+      max_mean = st._do_maximum_mean(samples, envelope_t, 1)
+      max_mean = sess.run(max_mean)
+
+    # Compute the correct answer for this case in numpy.  In this
+    # example, `n` and `envelope` are such that `samples[2]` is the
+    # element that should be taken partially, regardless of the
+    # content of the `samples` array (see algorithm description in
+    # `../ops/statistical_testing.py`).
+    samples = sorted(samples)
+    weight = 1. / n - (envelope - 2. / n)
+    answer = samples[2] * weight + sum(samples[3:]) / n + envelope * 1.
+    self.assertAllClose(max_mean, answer, rtol=1e-9)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
index f0ba1ec3eb57c67c1a0edb15639e91916a4509b7..5fe1331d2c34612e980c7b376367cd63b627533d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -36,6 +37,35 @@ ds = distributions
 la = linalg
 
 
+class DummyMatrixTransform(bs.Bijector):
+  """Tractable matrix transformation.
+
+  This is a non-sensical bijector that has forward/inverse_min_event_ndims=2.
+  The main use is to check that transformed distribution calculations are done
+  appropriately.
+  """
+
+  def __init__(self):
+    super(DummyMatrixTransform, self).__init__(
+        forward_min_event_ndims=2,
+        is_constant_jacobian=False,
+        validate_args=False,
+        name="dummy")
+
+  def _forward(self, x):
+    return x
+
+  def _inverse(self, y):
+    return y
+
+  # Note: These jacobians don't make sense.
+  def _forward_log_det_jacobian(self, x):
+    return -linalg_ops.matrix_determinant(x)
+
+  def _inverse_log_det_jacobian(self, x):
+    return linalg_ops.matrix_determinant(x)
+
+
 class TransformedDistributionTest(test.TestCase):
 
   def _cls(self):
@@ -55,7 +85,7 @@ class TransformedDistributionTest(test.TestCase):
       # you may or may not need a reduce_sum.
       log_normal = self._cls()(
           distribution=ds.Normal(loc=mu, scale=sigma),
-          bijector=bs.Exp(event_ndims=0))
+          bijector=bs.Exp())
       sp_dist = stats.lognorm(s=sigma, scale=np.exp(mu))
 
       # sample
@@ -87,7 +117,7 @@ class TransformedDistributionTest(test.TestCase):
       sigma = 2.0
       abs_normal = self._cls()(
           distribution=ds.Normal(loc=mu, scale=sigma),
-          bijector=bs.AbsoluteValue(event_ndims=0))
+          bijector=bs.AbsoluteValue())
       sp_normal = stats.norm(mu, sigma)
 
       # sample
@@ -129,7 +159,7 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllClose(grid, cdf_, rtol=1e-6, atol=0.)
 
   def testCachedSamples(self):
-    exp_forward_only = bs.Exp(event_ndims=0)
+    exp_forward_only = bs.Exp()
     exp_forward_only._inverse = self._make_unimplemented(
         "inverse")
     exp_forward_only._inverse_event_shape_tensor = self._make_unimplemented(
@@ -153,7 +183,7 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllClose(expected_log_pdf, log_pdf_val, rtol=1e-4, atol=0.)
 
   def testCachedSamplesInvert(self):
-    exp_inverse_only = bs.Exp(event_ndims=0)
+    exp_inverse_only = bs.Exp()
     exp_inverse_only._forward = self._make_unimplemented(
         "forward")
     exp_inverse_only._forward_event_shape_tensor = self._make_unimplemented(
@@ -210,8 +240,11 @@ class TransformedDistributionTest(test.TestCase):
       int_identity = bs.Inline(
           forward_fn=array_ops.identity,
           inverse_fn=array_ops.identity,
-          inverse_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
-          forward_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          inverse_log_det_jacobian_fn=(
+              lambda y: math_ops.cast(0, dtypes.int32)),
+          forward_log_det_jacobian_fn=(
+              lambda x: math_ops.cast(0, dtypes.int32)),
+          forward_min_event_ndims=0,
           is_constant_jacobian=True)
       normal = self._cls()(
           distribution=ds.Normal(loc=0., scale=1.),
@@ -435,6 +468,82 @@ class ScalarToMultiTest(test.TestCase):
             event_shape=[3],
             validate_args=True)
 
+  def testMatrixEvent(self):
+    with self.test_session() as sess:
+      batch_shape = [2]
+      event_shape = [2, 3, 3]
+      batch_shape_pl = array_ops.placeholder(
+          dtypes.int32, name="dynamic_batch_shape")
+      event_shape_pl = array_ops.placeholder(
+          dtypes.int32, name="dynamic_event_shape")
+      feed_dict = {batch_shape_pl: np.array(batch_shape, dtype=np.int32),
+                   event_shape_pl: np.array(event_shape, dtype=np.int32)}
+
+      scale = 2.
+      loc = 0.
+      fake_mvn_dynamic = self._cls()(
+          distribution=ds.Normal(
+              loc=loc,
+              scale=scale),
+          bijector=DummyMatrixTransform(),
+          batch_shape=batch_shape_pl,
+          event_shape=event_shape_pl,
+          validate_args=True)
+
+      fake_mvn_static = self._cls()(
+          distribution=ds.Normal(
+              loc=loc,
+              scale=scale),
+          bijector=DummyMatrixTransform(),
+          batch_shape=batch_shape,
+          event_shape=event_shape,
+          validate_args=True)
+
+      def actual_mvn_log_prob(x):
+        # This distribution is the normal PDF, reduced over the
+        # last 3 dimensions + a jacobian term which corresponds
+        # to the determinant of x.
+        return (np.sum(
+            stats.norm(loc, scale).logpdf(x), axis=(-1, -2, -3)) +
+                np.sum(np.linalg.det(x), axis=-1))
+
+      self.assertAllEqual([2, 3, 3], fake_mvn_static.event_shape)
+      self.assertAllEqual([2], fake_mvn_static.batch_shape)
+
+      self.assertAllEqual(tensor_shape.TensorShape(None),
+                          fake_mvn_dynamic.event_shape)
+      self.assertAllEqual(tensor_shape.TensorShape(None),
+                          fake_mvn_dynamic.batch_shape)
+
+      num_samples = 5e3
+      for fake_mvn, feed_dict in ((fake_mvn_static, {}),
+                                  (fake_mvn_dynamic, feed_dict)):
+        # Ensure sample works by checking first, second moments.
+        y = fake_mvn.sample(int(num_samples), seed=0)
+        x = y[0:5, ...]
+        [
+            x_,
+            fake_event_shape_,
+            fake_batch_shape_,
+            fake_log_prob_,
+            fake_prob_,
+        ] = sess.run([
+            x,
+            fake_mvn.event_shape_tensor(),
+            fake_mvn.batch_shape_tensor(),
+            fake_mvn.log_prob(x),
+            fake_mvn.prob(x),
+        ], feed_dict=feed_dict)
+
+        # Ensure all other functions work as intended.
+        self.assertAllEqual([5, 2, 2, 3, 3], x_.shape)
+        self.assertAllEqual([2, 3, 3], fake_event_shape_)
+        self.assertAllEqual([2], fake_batch_shape_)
+        self.assertAllClose(actual_mvn_log_prob(x_), fake_log_prob_,
+                            atol=0., rtol=1e-6)
+        self.assertAllClose(np.exp(actual_mvn_log_prob(x_)), fake_prob_,
+                            atol=0., rtol=1e-5)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
index c355adeedbfff1072281a81de726ddb0ece07882..1226c66113ec4b43f57371abf4983aef1a529ec1 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
@@ -61,7 +61,7 @@ class VectorLaplaceDiagTest(test.TestCase):
       dist = ds.TransformedDistribution(
           base_dist,
           validate_args=True,
-          bijector=bijectors.Softplus(event_ndims=1))
+          bijector=bijectors.Softplus())
       samps = dist.sample(5)  # Shape [5, 1, 3].
       self.assertAllEqual([5, 1], dist.log_prob(samps).get_shape())
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index bc6b02542ebf3b83d58f888509dafb86351de8a7..babce80396cfc41b53e99f91038d4f077c7efe82 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -38,6 +38,7 @@
 @@SinhArcsinh
 @@SoftmaxCentered
 @@Softplus
+@@Softsign
 @@Square
 @@Weibull
 
@@ -74,6 +75,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import *
+from tensorflow.contrib.distributions.python.ops.bijectors.softsign import *
 from tensorflow.contrib.distributions.python.ops.bijectors.square import *
 from tensorflow.python.ops.distributions.bijector import *
 from tensorflow.python.ops.distributions.identity_bijector import Identity
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
index 0fe9f6aa78fbe845b99d0668f075b0162ec2a9f7..c9e31d7712f09f6c4b4cc6ae51a34c42a19c291d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
@@ -18,9 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -72,38 +70,22 @@ class AbsoluteValue(bijector.Bijector):
 
   """
 
-  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
+  def __init__(self, validate_args=False, name="absolute_value"):
     """Instantiates the `AbsoluteValue` bijector.
 
     Args:
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.  Currently only zero is
-        supported.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness, in particular whether inputs to `inverse` and
         `inverse_log_det_jacobian` are non-negative.
       name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError:  If `event_ndims` is not zero.
     """
     self._graph_parents = []
     self._name = name
 
-    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-    event_ndims_const = tensor_util.constant_value(event_ndims)
-    if event_ndims_const is not None and event_ndims_const not in (0,):
-      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
-    else:
-      if validate_args:
-        event_ndims = control_flow_ops.with_dependencies(
-            [check_ops.assert_equal(
-                event_ndims, 0, message="event_ndims was not 0")],
-            event_ndims)
-
     with self._name_scope("init"):
       super(AbsoluteValue, self).__init__(
-          event_ndims=event_ndims,
+          forward_min_event_ndims=0,
+          is_constant_jacobian=True,
           validate_args=validate_args,
           name=name)
 
@@ -121,8 +103,7 @@ class AbsoluteValue(bijector.Bijector):
     # If event_ndims = 2,
     # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
     # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
-    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
-    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
+    zeros = constant_op.constant(0., dtype=y.dtype)
     if self.validate_args:
       zeros = control_flow_ops.with_dependencies(
           [check_ops.assert_non_negative(y, message="Argument y was negative")],
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
index bef7bbb49b715497695f7513e19ecab4fa56c47e..b4c2939eb914d50475ba6b1c1e979a804090f641 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
@@ -184,6 +184,7 @@ class Affine(bijector.Bijector):
     with self._name_scope("init", values=[
         shift, scale_identity_multiplier, scale_diag, scale_tril,
         scale_perturb_diag, scale_perturb_factor]):
+
       # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
       dtype = dtypes.float32
 
@@ -234,7 +235,7 @@ class Affine(bijector.Bijector):
           event_ndims=1,
           validate_args=validate_args)
       super(Affine, self).__init__(
-          event_ndims=1,
+          forward_min_event_ndims=1,
           graph_parents=(
               [self._scale] if tensor_util.is_tensor(self._scale)
               else self._scale.graph_parents +
@@ -360,16 +361,17 @@ class Affine(bijector.Bijector):
         x, sample_shape, expand_batch_dim=False)
     return x
 
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
   def _forward_log_det_jacobian(self, x):
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
     if self._is_only_identity_multiplier:
       # We don't pad in this case and instead let the fldj be applied
       # via broadcast.
       event_size = array_ops.shape(x)[-1]
       event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
       return math_ops.log(math_ops.abs(self._scale)) * event_size
+
     return self.scale.log_abs_determinant()
 
   def _maybe_check_scale(self):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
index 89043b1410370074f11f2cfa59b6b6663fa62521..59f9742d576a7804f401d3a47ba31ae61d6c6e54 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
@@ -22,9 +22,6 @@ from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.ops.linalg import linear_operator
 
@@ -94,7 +91,6 @@ class AffineLinearOperator(bijector.Bijector):
   def __init__(self,
                shift=None,
                scale=None,
-               event_ndims=1,
                validate_args=False,
                name="affine_linear_operator"):
     """Instantiates the `AffineLinearOperator` bijector.
@@ -103,14 +99,11 @@ class AffineLinearOperator(bijector.Bijector):
       shift: Floating-point `Tensor`.
       scale:  Subclass of `LinearOperator`. Represents the (batch) positive
         definite matrix `M` in `R^{k x k}`.
-      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
 
     Raises:
-      ValueError: if `event_ndims` is not 0 or 1.
       TypeError: if `scale` is not a `LinearOperator`.
       TypeError: if `shift.dtype` does not match `scale.dtype`.
       ValueError: if not `scale.is_non_singular`.
@@ -120,20 +113,6 @@ class AffineLinearOperator(bijector.Bijector):
     self._validate_args = validate_args
     graph_parents = []
     with self._name_scope("init", values=[shift]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      if tensor_util.constant_value(event_ndims) is not None:
-        event_ndims = tensor_util.constant_value(event_ndims)
-        if event_ndims not in (0, 1):
-          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-        graph_parents += [event_ndims]
-
       # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
       dtype = dtypes.float32
 
@@ -166,10 +145,10 @@ class AffineLinearOperator(bijector.Bijector):
       self._scale = scale
       self._shaper = _DistributionShape(
           batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
+          event_ndims=1,
           validate_args=validate_args)
       super(AffineLinearOperator, self).__init__(
-          event_ndims=event_ndims,
+          forward_min_event_ndims=1,
           graph_parents=graph_parents,
           is_constant_jacobian=True,
           dtype=dtype,
@@ -213,12 +192,13 @@ class AffineLinearOperator(bijector.Bijector):
           x, sample_shape, expand_batch_dim=False)
     return x
 
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
+  def _forward_log_det_jacobian(self, x):
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
     if self.scale is None:
-      return constant_op.constant(0, dtype=x.dtype.base_dtype)
+      return constant_op.constant(0., dtype=x.dtype.base_dtype)
+
     with ops.control_dependencies(self._maybe_collect_assertions() if
                                   self.validate_args else []):
       return self.scale.log_abs_determinant()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py
index 8adaa54c843d1b243a02967402a37b7c63fabbdf..cd792e2c8cf48602daf9fb5eb56b8c34bac050c7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_scalar.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -99,7 +100,7 @@ class AffineScalar(bijector.Bijector):
               self._scale)
 
       super(AffineScalar, self).__init__(
-          event_ndims=0,
+          forward_min_event_ndims=0,
           is_constant_jacobian=True,
           validate_args=validate_args,
           name=name)
@@ -131,8 +132,10 @@ class AffineScalar(bijector.Bijector):
     return x
 
   def _forward_log_det_jacobian(self, x):
-    log_det_jacobian = array_ops.zeros_like(x)
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
     if self.scale is None:
-      return log_det_jacobian
-    log_det_jacobian += math_ops.log(math_ops.abs(self.scale))
-    return log_det_jacobian
+      return constant_op.constant(0., dtype=x.dtype.base_dtype)
+
+    return math_ops.log(math_ops.abs(self.scale))
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py b/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
index 33fdd32d7a0a01685690e598c69adca2c95972e9..224cec8a63dba53a528490117efac890312fe8d5 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/batch_normalization.py
@@ -157,7 +157,12 @@ class BatchNormalization(bijector.Bijector):
         gamma_constraint=g_constraint)
     self._validate_bn_layer(self.batchnorm)
     self._training = training
+    if isinstance(self.batchnorm.axis, int):
+      forward_min_event_ndims = 1
+    else:
+      forward_min_event_ndims = len(self.batchnorm.axis)
     super(BatchNormalization, self).__init__(
+        forward_min_event_ndims=forward_min_event_ndims,
         validate_args=validate_args, name=name)
 
   def _validate_bn_layer(self, layer):
@@ -186,7 +191,6 @@ class BatchNormalization(bijector.Bijector):
     input_shape = np.int32(x.shape.as_list())
 
     ndims = len(input_shape)
-    # event_dims = self._compute_event_dims(x)
     reduction_axes = [i for i in range(ndims) if i not in self.batchnorm.axis]
     # Broadcasting only necessary for single-axis batch norm where the axis is
     # not the last dimension
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
index 3ce7c26213034c7345a20faa803c94a1bfa8d579..85ad23e4133ef09051cdc8b45e489caeea90fbb3 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
@@ -21,6 +21,9 @@ from __future__ import print_function
 import itertools
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions import bijector
 
 
@@ -29,6 +32,91 @@ __all__ = [
 ]
 
 
+def _use_static_shape(input_tensor, ndims):
+  return input_tensor.shape.is_fully_defined() and isinstance(ndims, int)
+
+
+def _maybe_get_event_ndims_statically(event_ndims):
+  static_event_ndims = (event_ndims if isinstance(event_ndims, int)
+                        else tensor_util.constant_value(event_ndims))
+  if static_event_ndims is not None:
+    return static_event_ndims
+
+  return event_ndims
+
+
+def _compute_min_event_ndims(bijector_list, compute_forward=True):
+  """Computes the min_event_ndims associated with the give list of bijectors.
+
+  Given a list `bijector_list` of bijectors, compute the min_event_ndims that is
+  associated with the composition of bijectors in that list.
+
+  min_event_ndims is the # of right most dimensions for which the bijector has
+  done necessary computation on (i.e. the non-broadcastable part of the
+  computation).
+
+  We can derive the min_event_ndims for a chain of bijectors as follows:
+
+  In the case where there are no rank changing bijectors, this will simply be
+  `max(b.forward_min_event_ndims for b in bijector_list)`. This is because the
+  bijector with the most forward_min_event_ndims requires the most dimensions,
+  and hence the chain also requires operating on those dimensions.
+
+  However in the case of rank changing, more care is needed in determining the
+  exact amount of dimensions. Padding dimensions causes subsequent bijectors to
+  operate on the padded dimensions, and Removing dimensions causes bijectors to
+  operate more left.
+
+  Args:
+    bijector_list: List of bijectors to be composed by chain.
+    compute_forward: Boolean. If True, computes the min_event_ndims associated
+      with a forward call to Chain, and otherwise computes the min_event_ndims
+      associated with an inverse call to Chain. The latter is the same as the
+      min_event_ndims associated with a forward call to Invert(Chain(....)).
+
+  Returns:
+    min_event_ndims
+  """
+  min_event_ndims = 0
+  # This is a mouthful, but what this encapsulates is that if not for rank
+  # changing bijectors, we'd only need to compute the largest of the min
+  # required ndims. Hence "max_min". Due to rank changing bijectors, we need to
+  # account for synthetic rank growth / synthetic rank decrease from a rank
+  # changing bijector.
+  rank_changed_adjusted_max_min_event_ndims = 0
+
+  if compute_forward:
+    bijector_list = reversed(bijector_list)
+
+  for b in bijector_list:
+    if compute_forward:
+      current_min_event_ndims = b.forward_min_event_ndims
+      current_inverse_min_event_ndims = b.inverse_min_event_ndims
+    else:
+      current_min_event_ndims = b.inverse_min_event_ndims
+      current_inverse_min_event_ndims = b.forward_min_event_ndims
+
+    # New dimensions were touched.
+    if rank_changed_adjusted_max_min_event_ndims < current_min_event_ndims:
+      min_event_ndims += (
+          current_min_event_ndims - rank_changed_adjusted_max_min_event_ndims)
+    rank_changed_adjusted_max_min_event_ndims = max(
+        current_min_event_ndims, rank_changed_adjusted_max_min_event_ndims)
+
+    # If the number of dimensions has increased via forward, then
+    # inverse_min_event_ndims > forward_min_event_ndims, and hence the
+    # dimensions we computed on, have moved left (so we have operated
+    # on additional dimensions).
+    # Conversely, if the number of dimensions has decreased via forward,
+    # then we have inverse_min_event_ndims < forward_min_event_ndims,
+    # and so we will have operated on fewer right most dimensions.
+
+    number_of_changed_dimensions = (
+        current_min_event_ndims - current_inverse_min_event_ndims)
+    rank_changed_adjusted_max_min_event_ndims -= number_of_changed_dimensions
+  return min_event_ndims
+
+
 class Chain(bijector.Bijector):
   """Bijector which applies a sequence of bijectors.
 
@@ -93,21 +181,24 @@ class Chain(bijector.Bijector):
       raise ValueError("incompatible dtypes: %s" % dtype)
     elif len(dtype) == 2:
       dtype = dtype[1] if dtype[0] is None else dtype[0]
-      event_ndims = bijectors[0].event_ndims
     elif len(dtype) == 1:
       dtype = dtype[0]
-      event_ndims = bijectors[0].event_ndims
     else:
       dtype = None
-      event_ndims = None
+
+    inverse_min_event_ndims = _compute_min_event_ndims(
+        bijectors, compute_forward=False)
+    forward_min_event_ndims = _compute_min_event_ndims(
+        bijectors, compute_forward=True)
 
     super(Chain, self).__init__(
         graph_parents=list(itertools.chain.from_iterable(
             b.graph_parents for b in bijectors)),
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
         is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
         validate_args=validate_args,
         dtype=dtype,
-        event_ndims=event_ndims,
         name=name or ("identity" if not bijectors else
                       "_of_".join(["chain"] + [b.name for b in bijectors])))
 
@@ -147,10 +238,31 @@ class Chain(bijector.Bijector):
     return y
 
   def _inverse_log_det_jacobian(self, y, **kwargs):
-    ildj = constant_op.constant(0., dtype=y.dtype,
-                                name="inverse_log_det_jacobian")
+    ildj = constant_op.constant(
+        0., dtype=y.dtype.base_dtype, name="inverse_log_det_jacobian")
+
+    if not self.bijectors:
+      return ildj
+
+    event_ndims = _maybe_get_event_ndims_statically(
+        self.inverse_min_event_ndims)
+
+    if _use_static_shape(y, event_ndims):
+      event_shape = y.shape[y.shape.ndims - event_ndims:]
+    else:
+      event_shape = array_ops.shape(y)[array_ops.rank(y) - event_ndims:]
+
     for b in self.bijectors:
-      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
+      ildj += b.inverse_log_det_jacobian(
+          y, event_ndims=event_ndims, **kwargs.get(b.name, {}))
+
+      if _use_static_shape(y, event_ndims):
+        event_shape = b.inverse_event_shape(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims)
+      else:
+        event_shape = b.inverse_event_shape_tensor(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(
+            array_ops.rank(event_shape))
       y = b.inverse(y, **kwargs.get(b.name, {}))
     return ildj
 
@@ -160,9 +272,34 @@ class Chain(bijector.Bijector):
     return x
 
   def _forward_log_det_jacobian(self, x, **kwargs):
-    fldj = constant_op.constant(0., dtype=x.dtype,
-                                name="forward_log_det_jacobian")
+    x = ops.convert_to_tensor(x, name="x")
+
+    fldj = constant_op.constant(
+        0., dtype=x.dtype, name="inverse_log_det_jacobian")
+
+    if not self.bijectors:
+      return fldj
+
+    event_ndims = _maybe_get_event_ndims_statically(
+        self.forward_min_event_ndims)
+
+    if _use_static_shape(x, event_ndims):
+      event_shape = x.shape[x.shape.ndims - event_ndims:]
+    else:
+      event_shape = array_ops.shape(x)[array_ops.rank(x) - event_ndims:]
+
     for b in reversed(self.bijectors):
-      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
+      fldj += b.forward_log_det_jacobian(
+          x, event_ndims=event_ndims, **kwargs.get(b.name, {}))
+      if _use_static_shape(x, event_ndims):
+        event_shape = b.forward_event_shape(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims)
+      else:
+        event_shape = b.forward_event_shape_tensor(event_shape)
+        event_ndims = _maybe_get_event_ndims_statically(
+            array_ops.rank(event_shape))
+
       x = b.forward(x, **kwargs.get(b.name, {}))
+
     return fldj
+
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index 8f09e16058b766c788ab3acced6940fd0026b521..caae2adcfac7643cdc8f76dd1cccddd516105410 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -80,7 +80,7 @@ class CholeskyOuterProduct(bijector.Bijector):
     self._graph_parents = []
     self._name = name
     super(CholeskyOuterProduct, self).__init__(
-        event_ndims=2,
+        forward_min_event_ndims=2,
         validate_args=validate_args,
         name=name)
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
index ccb1f029277bc07011df7be047a075274f2b3a27..e9e994f839ab2fe0a0f52f5f404fb2a0c8f9cd94 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
@@ -44,12 +44,16 @@ class ConditionalBijector(bijector.Bijector):
       "**condition_kwargs":
       "Named arguments forwarded to subclass implementation."})
   def inverse_log_det_jacobian(
-      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
-    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
+      self, y, event_ndims, name="inverse_log_det_jacobian",
+      **condition_kwargs):
+    return self._call_inverse_log_det_jacobian(
+        y, event_ndims, name, **condition_kwargs)
 
   @distribution_util.AppendDocstring(kwargs_dict={
       "**condition_kwargs":
       "Named arguments forwarded to subclass implementation."})
   def forward_log_det_jacobian(
-      self, x, name="forward_log_det_jacobian", **condition_kwargs):
-    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
+      self, x, event_ndims, name="forward_log_det_jacobian",
+      **condition_kwargs):
+    return self._call_forward_log_det_jacobian(
+        x, event_ndims, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
index b1ff840d62a73c941a4d67dec73b5c9f4d5353f9..9fc1bbf052b419d07a9db149b990c2b80190d72b 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
@@ -33,8 +33,8 @@ class Exp(power_transform.PowerTransform):
 
     ```python
     # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    exp = Exp(event_ndims=2)
+    # batch ndim 2.
+    exp = Exp()
     x = [[[1., 2],
            [3, 4]],
           [[5, 6],
@@ -48,19 +48,17 @@ class Exp(power_transform.PowerTransform):
   """
 
   def __init__(self,
-               event_ndims=0,
                validate_args=False,
                name="exp"):
     """Instantiates the `Exp` bijector.
 
     Args:
-      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
     """
+    # forward_min_event_ndims = 0.
+    # No forward_min_event_ndims specified as this is done in PowerTransform.
     super(Exp, self).__init__(
-        event_ndims=event_ndims,
         validate_args=validate_args,
         name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
index 67f39785563255be0fe154aca3cbcf01c6a01e73..e656a258e56e71898ecb719dd2af876f158cf799 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
@@ -48,7 +48,6 @@ class Gumbel(bijector.Bijector):
   def __init__(self,
                loc=0.,
                scale=1.,
-               event_ndims=0,
                validate_args=False,
                name="gumbel"):
     """Instantiates the `Gumbel` bijector.
@@ -60,8 +59,6 @@ class Gumbel(bijector.Bijector):
       scale: Positive Float-like `Tensor` that is the same dtype and is
         broadcastable with `loc`.
         This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -80,7 +77,9 @@ class Gumbel(bijector.Bijector):
         ], self._scale)
 
     super(Gumbel, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
+        validate_args=validate_args,
+        forward_min_event_ndims=0,
+        name=name)
 
   @property
   def loc(self):
@@ -102,15 +101,11 @@ class Gumbel(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
+    return math_ops.log(self.scale / (-math_ops.log(y) * y))
 
   def _forward_log_det_jacobian(self, x):
-    event_dims = self._event_dims_tensor(x)
     z = (x - self.loc) / self.scale
-    return math_ops.reduce_sum(
-        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
+    return -z - math_ops.exp(-z) - math_ops.log(self.scale)
 
   def _maybe_assert_valid_y(self, y):
     if not self.validate_args:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
index fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94..2bde956d1345129285acae4684256c5ac828b9a1 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
@@ -40,7 +40,7 @@ class Inline(bijector.Bijector):
     name="exp")
   ```
 
-  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
+  The above example is equivalent to the `Bijector` `Exp()`.
   """
 
   def __init__(self,
@@ -54,6 +54,8 @@ class Inline(bijector.Bijector):
                inverse_event_shape_tensor_fn=None,
                is_constant_jacobian=False,
                validate_args=False,
+               forward_min_event_ndims=None,
+               inverse_min_event_ndims=None,
                name="inline"):
     """Creates a `Bijector` from callables.
 
@@ -76,10 +78,15 @@ class Inline(bijector.Bijector):
         constant for all input arguments.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
+      forward_min_event_ndims: Python `int` indicating the minimal
+        dimensionality this bijector acts on.
+      inverse_min_event_ndims: Python `int` indicating the minimal
+        dimensionality this bijector acts on.
       name: Python `str`, name given to ops managed by this object.
     """
     super(Inline, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=forward_min_event_ndims,
+        inverse_min_event_ndims=inverse_min_event_ndims,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
@@ -134,8 +141,8 @@ class Inline(bijector.Bijector):
           "inverse_log_det_jacobian_fn is not a callable function.")
     return self._inverse_log_det_jacobian_fn(y, **kwargs)
 
-  def _forward_log_det_jacobian(self, y, **kwargs):
+  def _forward_log_det_jacobian(self, x, **kwargs):
     if not callable(self._forward_log_det_jacobian_fn):
       raise NotImplementedError(
           "forward_log_det_jacobian_fn is not a callable function.")
-    return self._forward_log_det_jacobian_fn(y, **kwargs)
+    return self._forward_log_det_jacobian_fn(x, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
index 2c603fe61f36dd27f4984fe6c13c11f2fb534321..1904239a0e7009c35cc4f3c8876fd749463a2b83 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
@@ -66,8 +66,9 @@ class Invert(bijector_lib.Bijector):
 
     self._bijector = bijector
     super(Invert, self).__init__(
-        event_ndims=bijector.event_ndims,
         graph_parents=bijector.graph_parents,
+        forward_min_event_ndims=bijector.inverse_min_event_ndims,
+        inverse_min_event_ndims=bijector.forward_min_event_ndims,
         is_constant_jacobian=bijector.is_constant_jacobian,
         validate_args=validate_args,
         dtype=bijector.dtype,
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py
index f5de052c9ed18b1ebf4c174aeea3a951b1ddcd9d..97000c17262d3efdef10274711364c2bc2083bd4 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/kumaraswamy.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -48,7 +47,6 @@ class Kumaraswamy(bijector.Bijector):
   def __init__(self,
                concentration1=None,
                concentration0=None,
-               event_ndims=0,
                validate_args=False,
                name="kumaraswamy"):
     """Instantiates the `Kumaraswamy` bijector.
@@ -60,31 +58,14 @@ class Kumaraswamy(bijector.Bijector):
       concentration0: Python `float` scalar indicating the transform power,
         i.e., `Y = g(X) = (1 - (1 - X)**(1 / b))**(1 / a)` where `b` is
         `concentration0`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution. Currently only zero is
-        supported.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError:  If `event_ndims` is not zero.
     """
     self._graph_parents = []
     self._name = name
     self._validate_args = validate_args
 
-    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-    event_ndims_const = tensor_util.constant_value(event_ndims)
-    if event_ndims_const is not None and event_ndims_const not in (0,):
-      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
-    else:
-      if validate_args:
-        event_ndims = control_flow_ops.with_dependencies(
-            [check_ops.assert_equal(
-                event_ndims, 0, message="event_ndims was not 0")],
-            event_ndims)
-
     with self._name_scope("init", values=[concentration1, concentration0]):
       concentration1 = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration1, name="concentration1"),
@@ -96,7 +77,7 @@ class Kumaraswamy(bijector.Bijector):
     self._concentration1 = concentration1
     self._concentration0 = concentration0
     super(Kumaraswamy, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -123,12 +104,10 @@ class Kumaraswamy(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
+    return (
         math_ops.log(self.concentration1) + math_ops.log(self.concentration0) +
         (self.concentration1 - 1) * math_ops.log(y) +
-        (self.concentration0 - 1) * math_ops.log1p(-y**self.concentration1),
-        axis=event_dims)
+        (self.concentration0 - 1) * math_ops.log1p(-y**self.concentration1))
 
   def _maybe_assert_valid_concentration(self, concentration, validate_args):
     """Checks the validity of a concentration parameter."""
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index 84b2340c75514c3d2c12bf4d775ba74450a0dc26..ef56cf6ddda4dca2b1575e844b2584689e531b81 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -61,7 +61,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
   this property by zeroing out weights in its `masked_dense` layers.
 
   In the `tf.distributions` framework, a "normalizing flow" is implemented as a
-  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
+  `tf.contrib.distributions.bijectors.Bijector`. The `forward` "autoregression"
   is implemented using a `tf.while_loop` and a deep neural network (DNN) with
   masked weights such that the autoregressive property is automatically met in
   the `inverse`.
@@ -220,6 +220,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
     self._shift_and_log_scale_fn = shift_and_log_scale_fn
     self._unroll_loop = unroll_loop
     super(MaskedAutoregressiveFlow, self).__init__(
+        forward_min_event_ndims=1,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
index 8654cc39d0c41ec4f1b85cd5fc4366ceaf4b224d..4978167803fc38b112c95922519c8c296cee2561 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
@@ -114,6 +114,7 @@ class Permute(bijector_lib.Bijector):
         ], permutation)
       self._permutation = permutation
       super(Permute, self).__init__(
+          forward_min_event_ndims=1,
           is_constant_jacobian=True,
           validate_args=validate_args,
           name=name or "permute")
@@ -132,7 +133,10 @@ class Permute(bijector_lib.Bijector):
         axis=-1)
 
   def _inverse_log_det_jacobian(self, y):
-    return constant_op.constant(0., dtype=y.dtype)
+    # is_constant_jacobian = True for this bijector, hence the
+    # `log_det_jacobian` need only be specified for a single input, as this will
+    # be tiled to match `event_ndims`.
+    return constant_op.constant(0., dtype=y.dtype.base_dtype)
 
   def _forward_log_det_jacobian(self, x):
-    return constant_op.constant(0., dtype=x.dtype)
+    return constant_op.constant(0., dtype=x.dtype.base_dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
index c37db61720d10949f294ff7b2e9778ba6efa57f0..71f123f2a998458edaa9c8da07ea2932f62625ca 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
@@ -43,7 +43,6 @@ class PowerTransform(bijector.Bijector):
 
   def __init__(self,
                power=0.,
-               event_ndims=0,
                validate_args=False,
                name="power_transform"):
     """Instantiates the `PowerTransform` bijector.
@@ -51,8 +50,6 @@ class PowerTransform(bijector.Bijector):
     Args:
       power: Python `float` scalar indicating the transform power, i.e.,
         `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -70,7 +67,7 @@ class PowerTransform(bijector.Bijector):
       raise ValueError("`power` must be a non-negative TF constant.")
     self._power = power
     super(PowerTransform, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -97,18 +94,13 @@ class PowerTransform(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return (self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log(y), axis=event_dims)
+    return (self.power - 1.) * math_ops.log(y)
 
   def _forward_log_det_jacobian(self, x):
     x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
     if self.power == 0.:
-      return math_ops.reduce_sum(x, axis=event_dims)
-    return (1. / self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log1p(x * self.power),
-        axis=event_dims)
+      return x
+    return (1. / self.power - 1.) * math_ops.log1p(x * self.power)
 
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args or self.power == 0.:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
index 71ab369d01aafc33854a2c2437f96bbb493cc6fb..f09ab21bce100e9dafb77eff1f3999ce4b71c681 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
@@ -166,7 +166,7 @@ class RealNVP(bijector_lib.Bijector):
     self._input_depth = None
     self._shift_and_log_scale_fn = shift_and_log_scale_fn
     super(RealNVP, self).__init__(
-        event_ndims=1,
+        forward_min_event_ndims=1,
         is_constant_jacobian=is_constant_jacobian,
         validate_args=validate_args,
         name=name)
@@ -224,7 +224,7 @@ class RealNVP(bijector_lib.Bijector):
     _, log_scale = self._shift_and_log_scale_fn(
         x0, self._input_depth - self._num_masked)
     if log_scale is None:
-      return constant_op.constant(0., dtype=x.dtype, name="ildj")
+      return constant_op.constant(0., dtype=x.dtype, name="fldj")
     return math_ops.reduce_sum(log_scale, axis=-1)
 
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index 55eca063126797d577653f0d6bcdfddf8192bdb5..f21b982ba664b312c716827c7925767a0b5a037a 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -128,15 +128,17 @@ class Reshape(bijector_lib.Bijector):
       self._event_shape_in = event_shape_in
       self._event_shape_out = event_shape_out
 
-      super(Reshape, self).__init__(is_constant_jacobian=True,
-                                    validate_args=validate_args,
-                                    name=name or "reshape")
+      super(Reshape, self).__init__(
+          forward_min_event_ndims=0,
+          is_constant_jacobian=True,
+          validate_args=validate_args,
+          name=name or "reshape")
 
   def _maybe_check_valid_shape(self, shape, validate_args):
     """Check that a shape Tensor is int-type and otherwise sane."""
     if not shape.dtype.is_integer:
       raise TypeError("{} dtype ({}) should be `int`-like.".format(
-          shape.op.name, shape.dtype.name))
+          shape, shape.dtype.name))
 
     assertions = []
 
@@ -144,10 +146,10 @@ class Reshape(bijector_lib.Bijector):
     ndims_ = tensor_util.constant_value(ndims)
     if ndims_ is not None and ndims_ > 1:
       raise ValueError("`{}` rank ({}) should be <= 1.".format(
-          shape.op.name, ndims_))
+          shape, ndims_))
     elif validate_args:
       assertions.append(check_ops.assert_less_equal(
-          ndims, 1, message="`{}` rank should be <= 1.".format(shape.op.name)))
+          ndims, 1, message="`{}` rank should be <= 1.".format(shape)))
 
     shape_ = tensor_util.constant_value_as_shape(shape)
     if shape_.is_fully_defined():
@@ -155,12 +157,12 @@ class Reshape(bijector_lib.Bijector):
       if sum(es == -1) > 1:
         raise ValueError(
             "`{}` must have at most one `-1` (given {})"
-            .format(shape.op.name, es))
+            .format(shape, es))
       if np.any(es < -1):
         raise ValueError(
             "`{}` elements must be either positive integers or `-1`"
             "(given {})."
-            .format(shape.op.name, es))
+            .format(shape, es))
     elif validate_args:
       assertions.extend([
           check_ops.assert_less_equal(
@@ -168,11 +170,11 @@ class Reshape(bijector_lib.Bijector):
                   math_ops.cast(math_ops.equal(shape, -1), dtypes.int32)),
               1,
               message="`{}` elements must have at most one `-1`."
-              .format(shape.op.name)),
+              .format(shape)),
           check_ops.assert_greater_equal(
               shape, -1,
               message="`{}` elements must be either positive integers or `-1`."
-              .format(shape.op.name)),
+              .format(shape)),
       ])
     return assertions
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
index a640dfe7dfbcce96261589c7fc49107deaefdd54..5df8c886315ff75cdc884e3b9b4665fb64bb109d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
@@ -33,7 +33,9 @@ class Sigmoid(bijector.Bijector):
 
   def __init__(self, validate_args=False, name="sigmoid"):
     super(Sigmoid, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
 
   def _forward(self, x):
     return math_ops.sigmoid(x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
index 3a75e4ae9495793901b0da91a5aa3982aab35852..2a32e8abcde940b0056b0faf2955ec1b3bd71803 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
@@ -91,7 +91,6 @@ class SinhArcsinh(bijector.Bijector):
   def __init__(self,
                skewness=None,
                tailweight=None,
-               event_ndims=0,
                validate_args=False,
                name="SinhArcsinh"):
     """Instantiates the `SinhArcsinh` bijector.
@@ -101,8 +100,6 @@ class SinhArcsinh(bijector.Bijector):
         of type `float32`.
       tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
         `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -125,7 +122,9 @@ class SinhArcsinh(bijector.Bijector):
                 message="Argument tailweight was not positive")
         ], self._tailweight)
     super(SinhArcsinh, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
 
   @property
   def skewness(self):
@@ -149,31 +148,29 @@ class SinhArcsinh(bijector.Bijector):
     # dx/dy
     # = cosh(arcsinh(y) / tailweight - skewness)
     #     / (tailweight * sqrt(y**2 + 1))
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+
+    # This is computed inside the log to avoid catastrophic cancellations
+    # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+    return (
         math_ops.log(math_ops.cosh(
             math_ops.asinh(y) / self.tailweight - self.skewness)
                      # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
                      # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
                      / _sqrtx2p1(y))
-        - math_ops.log(self.tailweight),
-        axis=event_dims)
+        - math_ops.log(self.tailweight))
 
   def _forward_log_det_jacobian(self, x):
     # y = sinh((arcsinh(x) + skewness) * tailweight)
     # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
     # dy/dx
     # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+
+    # This is computed inside the log to avoid catastrophic cancellations
+    # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+    return (
         math_ops.log(math_ops.cosh(
             (math_ops.asinh(x) + self.skewness) * self.tailweight)
                      # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
                      # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
                      / _sqrtx2p1(x))
-        + math_ops.log(self.tailweight),
-        axis=event_dims)
+        + math_ops.log(self.tailweight))
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index dc94fd0a38de29f5a7ee6ca826aab0ecf8712966..f52b91550edff7390d8094a4508d862674e85d59 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -66,7 +66,7 @@ class SoftmaxCentered(bijector.Bijector):
     self._graph_parents = []
     self._name = name
     super(SoftmaxCentered, self).__init__(
-        event_ndims=1,
+        forward_min_event_ndims=1,
         validate_args=validate_args,
         name=name)
 
@@ -105,8 +105,6 @@ class SoftmaxCentered(bijector.Bijector):
       y.shape.assert_is_compatible_with(shape)
       y.set_shape(shape)
 
-    # Since we only support event_ndims in [0, 1] and we do padding, we always
-    # reduce over the last dimension, i.e., dim=-1 (which is the default).
     return nn_ops.softmax(y)
 
   def _inverse(self, y):
@@ -162,8 +160,6 @@ class SoftmaxCentered(bijector.Bijector):
     #   -log_normalization + reduce_sum(logits - log_normalization)
     log_normalization = nn_ops.softplus(
         math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
-    fldj = (-log_normalization +
-            math_ops.reduce_sum(x - log_normalization,
-                                axis=-1,
-                                keep_dims=True))
-    return array_ops.squeeze(fldj, squeeze_dims=-1)
+    return array_ops.squeeze(
+        (-log_normalization + math_ops.reduce_sum(
+            x - log_normalization, axis=-1, keepdims=True)), axis=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
index 81957fcf78922fa15fd20a25d144071f431161ae..96a938c803418ff818f9c531754b47ba1eb8667a 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
@@ -62,7 +62,7 @@ class Softplus(bijector.Bijector):
     ```python
     # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
     # batch ndim and 2 event ndims (i.e., vector of matrices).
-    softplus = Softplus(event_ndims=2)
+    softplus = Softplus()
     x = [[[1., 2],
           [3, 4]],
          [[5, 6],
@@ -81,7 +81,6 @@ class Softplus(bijector.Bijector):
               "Nonzero floating point `Tensor`.  Controls the softness of what "
               "would otherwise be a kink at the origin.  Default is 1.0")})
   def __init__(self,
-               event_ndims=0,
                hinge_softness=None,
                validate_args=False,
                name="softplus"):
@@ -101,7 +100,7 @@ class Softplus(bijector.Bijector):
             [nonzero_check], self.hinge_softness)
 
     super(Softplus, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -130,14 +129,12 @@ class Softplus(bijector.Bijector):
     # 1 - exp{-Y} approx Y.
     if self.hinge_softness is not None:
       y /= math_ops.cast(self.hinge_softness, y.dtype)
-    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
-                                axis=self._event_dims_tensor(y))
+    return -math_ops.log(-math_ops.expm1(-y))
 
   def _forward_log_det_jacobian(self, x):
     if self.hinge_softness is not None:
       x /= math_ops.cast(self.hinge_softness, x.dtype)
-    return -math_ops.reduce_sum(nn_ops.softplus(-x),
-                                axis=self._event_dims_tensor(x))
+    return -nn_ops.softplus(-x)
 
   @property
   def hinge_softness(self):
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py b/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4a658c171b8313358754228aabbfa4bf93fd84d
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softsign.py
@@ -0,0 +1,86 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Softsign bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "Softsign",
+]
+
+
+class Softsign(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = X / (1 + |X|)`.
+
+  The softsign `Bijector` has the following two useful properties:
+
+  * The domain is all real numbers
+  * `softsign(x) approx sgn(x)`, for large `|x|`.
+
+  #### Examples
+
+  ```python
+  # Create the Y = softsign(X) transform.
+  softsign = Softsign()
+  x = [[[1., 2],
+        [3, 4]],
+       [[5, 6],
+        [7, 8]]]
+  x / (1 + abs(x)) == softsign.forward(x)
+  x / (1 - abs(x)) == softsign.inverse(x)
+  ```
+  """
+
+  def __init__(self, validate_args=False, name="softsign"):
+    super(Softsign, self).__init__(
+        forward_min_event_ndims=0,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    return x / (1. + math_ops.abs(x))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return y / (1. - math_ops.abs(y))
+
+  def _forward_log_det_jacobian(self, x):
+    return -2. * math_ops.log1p(math_ops.abs(x))
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return -2. * math_ops.log1p(-math_ops.abs(y))
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_valid = [
+        check_ops.assert_greater(
+            y, math_ops.cast(-1., dtype=y.dtype.base_dtype),
+            message="Inverse transformation input must be greater than -1."),
+        check_ops.assert_less(
+            y, math_ops.cast(1., dtype=y.dtype.base_dtype),
+            message="Inverse transformation input must be less than 1.")
+    ]
+
+    return control_flow_ops.with_dependencies(is_valid, y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/square.py b/tensorflow/contrib/distributions/python/ops/bijectors/square.py
index 1e9dbf35091fe51f2478dc085c394a77295ca4ee..2ccfdc95970e387e708603e2614ad29fb6a18db3 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/square.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/square.py
@@ -59,7 +59,7 @@ class Square(bijector.Bijector):
     """
     self._name = name
     super(Square, self).__init__(
-        event_ndims=0,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
index 00520bcda85e9527767e6342bf75f10667c264a8..39129cd22cdbf9ca1b4edd7cb5c3571a33837a29 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
@@ -50,7 +50,6 @@ class Weibull(bijector.Bijector):
   def __init__(self,
                scale=1.,
                concentration=1.,
-               event_ndims=0,
                validate_args=False,
                name="weibull"):
     """Instantiates the `Weibull` bijector.
@@ -62,8 +61,6 @@ class Weibull(bijector.Bijector):
       concentration: Positive Float-type `Tensor` that is the same dtype and is
         broadcastable with `scale`.
         This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -89,7 +86,7 @@ class Weibull(bijector.Bijector):
         ], self._concentration)
 
     super(Weibull, self).__init__(
-        event_ndims=event_ndims,
+        forward_min_event_ndims=0,
         validate_args=validate_args,
         name=name)
 
@@ -113,22 +110,18 @@ class Weibull(bijector.Bijector):
 
   def _inverse_log_det_jacobian(self, y):
     y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
+    return (
         -math_ops.log1p(-y) +
         (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
-        math_ops.log(self.scale / self.concentration),
-        axis=event_dims)
+        math_ops.log(self.scale / self.concentration))
 
   def _forward_log_det_jacobian(self, x):
     x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
+    return (
         -(x / self.scale) ** self.concentration +
         (self.concentration - 1) * math_ops.log(x) +
         math_ops.log(self.concentration) +
-        -self.concentration * math_ops.log(self.scale),
-        axis=event_dims)
+        -self.concentration * math_ops.log(self.scale))
 
   def _maybe_assert_valid_x(self, x):
     if not self.validate_args:
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
index 1d4c5660d8d73b7b6a7e758fc834ccfddeb5c8ea..10b45361358b40a3c8fd725f27ad84ef9b8a37f5 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.contrib.distributions.python.ops import conditional_distribution
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import transformed_distribution
@@ -105,7 +106,9 @@ class ConditionalTransformedDistribution(
     bijector_kwargs = bijector_kwargs or {}
     distribution_kwargs = distribution_kwargs or {}
     x = self.bijector.inverse(y, **bijector_kwargs)
-    ildj = self.bijector.inverse_log_det_jacobian(y, **bijector_kwargs)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(
+        y, event_ndims=event_ndims, **bijector_kwargs)
     if self.bijector._is_injective:  # pylint: disable=protected-access
       return self._finish_log_prob_for_one_fiber(y, x, ildj,
                                                  distribution_kwargs)
@@ -128,7 +131,9 @@ class ConditionalTransformedDistribution(
     bijector_kwargs = bijector_kwargs or {}
     distribution_kwargs = distribution_kwargs or {}
     x = self.bijector.inverse(y, **bijector_kwargs)
-    ildj = self.bijector.inverse_log_det_jacobian(y, **bijector_kwargs)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(
+        y, event_ndims=event_ndims, **bijector_kwargs)
     if self.bijector._is_injective:  # pylint: disable=protected-access
       return self._finish_prob_for_one_fiber(y, x, ildj, distribution_kwargs)
 
@@ -214,3 +219,15 @@ class ConditionalTransformedDistribution(
     # implies the qth quantile of Y is g(x_q).
     inv_cdf = self.distribution.quantile(value, **distribution_kwargs)
     return self.bijector.forward(inv_cdf, **bijector_kwargs)
+
+  def _maybe_get_event_ndims_statically(self):
+    if self.event_shape.ndims is not None:
+      return self.event_shape.ndims
+
+    event_ndims = array_ops.size(self.event_shape_tensor())
+    static_event_ndims = tensor_util.constant_value(event_ndims)
+
+    if static_event_ndims is not None:
+      return static_event_ndims
+
+    return event_ndims
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index 92f2bba1828696248c9d9460566a08ba372c3358..3314181898870fa70dac3dfce42ba84de3d82a4a 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -114,7 +114,7 @@ def quadrature_scheme_lognormal_quantiles(
     # Create a LogNormal distribution.
     dist = transformed_lib.TransformedDistribution(
         distribution=normal_lib.Normal(loc=loc, scale=scale),
-        bijector=Exp(event_ndims=0),
+        bijector=Exp(),
         validate_args=validate_args)
     batch_ndims = dist.batch_shape.ndims
     if batch_ndims is None:
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index f56ba0781604cb5a4fb3070b79aa86e09ceb6766..02cf3c7992dc8cde3869ac9f12e7b4372cd6ea2c 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -409,5 +409,5 @@ class RelaxedOneHotCategorical(
                                        validate_args=validate_args,
                                        allow_nan_stats=allow_nan_stats)
     super(RelaxedOneHotCategorical, self).__init__(dist,
-                                                   bijectors.Exp(event_ndims=1),
+                                                   bijectors.Exp(),
                                                    name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index 0d8a1926913766da374cb65767dccfa28bf75579..cde6d855009ff45129f603de1462f60b828e661f 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -166,13 +166,13 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
 
       # Make the SAS bijector, 'F'.
       f = bijectors.SinhArcsinh(
-          skewness=skewness, tailweight=tailweight, event_ndims=0)
+          skewness=skewness, tailweight=tailweight)
       if has_default_skewness:
         f_noskew = f
       else:
         f_noskew = bijectors.SinhArcsinh(
             skewness=skewness.dtype.as_numpy_dtype(0.),
-            tailweight=tailweight, event_ndims=0)
+            tailweight=tailweight)
 
       # Make the AffineScalar bijector, Z --> loc + scale * Z (2 / F_0(2))
       c = 2 * scale / f_noskew.forward(ops.convert_to_tensor(2, dtype=dtype))
diff --git a/tensorflow/contrib/distributions/python/ops/statistical_testing.py b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
index 5c52015e5fd5d7f3cc49e337520296e58ea08bfb..9c69435fac109914ff29b307dfad105f62849339 100644
--- a/tensorflow/contrib/distributions/python/ops/statistical_testing.py
+++ b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
@@ -130,7 +130,7 @@ import itertools
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -169,31 +169,27 @@ def _do_maximum_mean(samples, envelope, high, name=None):
     samples = array_ops.transpose(samples, perm)
 
     samples = _batch_sort_vector(samples)
-    batch_shape = array_ops.shape(samples)[:-1]
-    n = array_ops.shape(samples)[-1]
-    step = 1. / math_ops.cast(n, dtype=samples.dtype.base_dtype)
-
-    def _loop_body(iter_, total, to_skip):
-      total = array_ops.where(
-          step <= to_skip,
-          total,
-          array_ops.where(
-              to_skip > 0.,
-              total + (step - to_skip) * samples[..., iter_],
-              total + step * samples[..., iter_]))
-      to_skip = array_ops.where(step <= to_skip, to_skip - step, 0.)
-      return [iter_ + 1, total, to_skip]
-
-    _, total, _ = control_flow_ops.while_loop(
-        cond=lambda iter_, *args: iter_ < n,
-        body=_loop_body,
-        loop_vars=[
-            0,
-            array_ops.zeros(batch_shape, dtype=samples.dtype.base_dtype),
-            envelope,  # to_skip
-        ])
-
-  return total + envelope * high
+
+    # The maximum mean is given by taking `envelope`-worth of
+    # probability from the smallest samples and moving it to the
+    # maximum value.  This amounts to:
+    # - ignoring the smallest k samples, where `k/n < envelope`
+    # - taking a `1/n - (envelope - k/n)` part of the index k sample
+    # - taking all the other samples
+    # - and adding `envelope * high` at the end.
+    # The following is a vectorized and batched way of computing this.
+    # `max_mean_contrib` is a mask implementing the previous.
+    batch_size = array_ops.shape(samples)[-1]
+    batch_size = math_ops.cast(batch_size, dtype=samples.dtype.base_dtype)
+    step = 1. / batch_size
+    cum_steps = step * math_ops.range(
+        1, batch_size + 1, dtype=samples.dtype.base_dtype)
+    max_mean_contrib = clip_ops.clip_by_value(
+        cum_steps - envelope[..., array_ops.newaxis],
+        clip_value_min=0.,
+        clip_value_max=step)
+    return math_ops.reduce_sum(
+        samples * max_mean_contrib, axis=-1) + envelope * high
 
 
 def _maximum_mean(samples, envelope, high, name=None):
@@ -234,7 +230,7 @@ def _maximum_mean(samples, envelope, high, name=None):
     envelope = ops.convert_to_tensor(envelope, name="envelope")
     high = ops.convert_to_tensor(high, name="high")
 
-    xmax = math_ops.reduce_max(samples, axis=[-1])
+    xmax = math_ops.reduce_max(samples, axis=[0])
     msg = "Given sample maximum value exceeds expectations"
     check_op = check_ops.assert_less_equal(xmax, high, message=msg)
     with ops.control_dependencies([check_op]):
@@ -279,7 +275,7 @@ def _minimum_mean(samples, envelope, low, name=None):
     envelope = ops.convert_to_tensor(envelope, name="envelope")
     low = ops.convert_to_tensor(low, name="low")
 
-    xmin = math_ops.reduce_min(samples, axis=[-1])
+    xmin = math_ops.reduce_min(samples, axis=[0])
     msg = "Given sample minimum value falls below expectations"
     check_op = check_ops.assert_greater_equal(xmin, low, message=msg)
     with ops.control_dependencies([check_op]):
@@ -319,8 +315,8 @@ def _dkwm_cdf_envelope(n, error_rate, name=None):
     return math_ops.sqrt(-gen_math_ops.log(error_rate / 2.) / (2. * n))
 
 
-def _check_shape_dominates(tensor, tensors):
-  """Check that broadcasting `tensor` against `tensors` does not expand it.
+def _check_shape_dominates(samples, parameters):
+  """Check that broadcasting `samples` against `parameters` does not expand it.
 
   Why?  Because I want to be very sure that the samples tensor is not
   accidentally enlarged by broadcasting against tensors that are
@@ -328,24 +324,27 @@ def _check_shape_dominates(tensor, tensors):
   sample counts end up inflated.
 
   Args:
-    tensor: A Tensor whose shape is to be protected against broadcasting.
-    tensors: A list of Tensors to check
+    samples: A Tensor whose shape is to be protected against broadcasting.
+    parameters: A list of Tensors who are parameters for the statistical test.
 
   Returns:
-    tensor: `tf.identity(tensor)` with control dependencies attached;
-      be sure to use that downstream.
+    samples: Return original `samples` with control dependencies attached
+      to ensure no broadcasting.
   """
   def check(t):
-    target = array_ops.shape(tensor)[1:]
-    result = array_ops.broadcast_dynamic_shape(target, array_ops.shape(t))
+    samples_batch_shape = array_ops.shape(samples)[1:]
+    broadcasted_batch_shape = array_ops.broadcast_dynamic_shape(
+        samples_batch_shape, array_ops.shape(t))
     # This rank check ensures that I don't get a wrong answer from the
     # _shapes_ broadcasting against each other.
-    gt = check_ops.assert_greater(array_ops.rank(target), array_ops.rank(t))
-    eq = check_ops.assert_equal(target, result)
-    return gt, eq
-  checks = list(itertools.chain(*[check(t) for t in tensors]))
+    samples_batch_ndims = array_ops.size(samples_batch_shape)
+    ge = check_ops.assert_greater_equal(
+        samples_batch_ndims, array_ops.rank(t))
+    eq = check_ops.assert_equal(samples_batch_shape, broadcasted_batch_shape)
+    return ge, eq
+  checks = list(itertools.chain(*[check(t) for t in parameters]))
   with ops.control_dependencies(checks):
-    return array_ops.identity(array_ops.identity(tensor))
+    return array_ops.identity(samples)
 
 
 def true_mean_confidence_interval_by_dkwm(
@@ -684,9 +683,13 @@ def assert_true_mean_equal_by_dkwm_two_sample(
       # I want to assert
       #   not (max_mean_1 < min_mean_2 or min_mean_1 > max_mean_2),
       # but I think I only have and-combination of asserts, so use DeMorgan.
-      clause1_op = check_ops.assert_greater_equal(max_mean_1, min_mean_2)
-      with ops.control_dependencies([clause1_op]):
-        return check_ops.assert_less_equal(min_mean_1, max_mean_2)
+      check_confidence_intervals_can_intersect = check_ops.assert_greater_equal(
+          max_mean_1, min_mean_2, message="Confidence intervals do not "
+          "intersect: samples1 has a smaller mean than samples2")
+      with ops.control_dependencies([check_confidence_intervals_can_intersect]):
+        return check_ops.assert_less_equal(
+            min_mean_1, max_mean_2, message="Confidence intervals do not "
+            "intersect: samples2 has a smaller mean than samples1")
 
 
 def min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index 971d65c4a69140161461fdac93bb588014dd3e88..da271a852d715cd4bc3423b23e8a597b116027f0 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -427,7 +427,6 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       self._endpoint_affine = [
           AffineLinearOperator(shift=loc_,
                                scale=scale_,
-                               event_ndims=1,
                                validate_args=validate_args,
                                name="endpoint_affine_{}".format(k))
           for k, (loc_, scale_) in enumerate(zip(loc, scale))]
@@ -467,7 +466,6 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       self._interpolated_affine = [
           AffineLinearOperator(shift=loc_,
                                scale=scale_,
-                               event_ndims=1,
                                validate_args=validate_args,
                                name="interpolated_affine_{}".format(k))
           for k, (loc_, scale_) in enumerate(zip(
@@ -621,9 +619,11 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     log_prob = math_ops.reduce_sum(self.distribution.log_prob(y), axis=-2)
     # Because the affine transformation has a constant Jacobian, it is the case
     # that `affine.fldj(x) = -affine.ildj(x)`. This is not true in general.
-    fldj = array_ops.stack(
-        [aff.forward_log_det_jacobian(x) for aff in self.interpolated_affine],
-        axis=-1)
+    fldj = array_ops.stack([
+        aff.forward_log_det_jacobian(
+            x,
+            event_ndims=array_ops.rank(self.event_shape_tensor())
+        ) for aff in self.interpolated_affine], axis=-1)
     return math_ops.reduce_logsumexp(
         self.mixture_distribution.logits - fldj + log_prob, axis=-1)
 
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index 003c66b9413fdcad20fbcc8b4bf47259692932e7..05919be124e8fbfe29e8111a0637db072830ff61 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -215,13 +215,13 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
       tailweight = ops.convert_to_tensor(
           tailweight, dtype=dtype, name="tailweight")
       f = bijectors.SinhArcsinh(
-          skewness=skewness, tailweight=tailweight, event_ndims=1)
+          skewness=skewness, tailweight=tailweight)
       if has_default_skewness:
         f_noskew = f
       else:
         f_noskew = bijectors.SinhArcsinh(
             skewness=skewness.dtype.as_numpy_dtype(0.),
-            tailweight=tailweight, event_ndims=0)
+            tailweight=tailweight)
 
       # Make the Affine bijector, Z --> loc + C * Z.
       c = 2 * scale_diag_part / f_noskew.forward(
diff --git a/tensorflow/contrib/eager/proto/BUILD b/tensorflow/contrib/eager/proto/BUILD
deleted file mode 100644
index b016d2dcb504044372c895e1eedf3511751bc13e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/proto/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-
-tf_proto_library(
-    name = "checkpointable_object_graph_proto",
-    srcs = [
-        "checkpointable_object_graph.proto",
-    ],
-    visibility = ["//tensorflow/contrib/eager/python:__subpackages__"],
-)
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index edb9130266e4ea93d2ec6ee373a90df504da18cf..e2744a430d1efe4b4a688dc7c5caff0bf83de358 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -71,7 +71,9 @@ cuda_py_test(
     additional_deps = [
         ":datasets",
         ":checkpointable_utils",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
+        "//tensorflow/contrib/data/python/ops:threadpool",
+        "//tensorflow/contrib/data/python/ops:unique",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -120,13 +122,13 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/contrib/eager/python:checkpointable_utils",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
@@ -140,11 +142,11 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":metrics",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/contrib/summary:summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -161,10 +163,10 @@ py_library(
     deps = [
         ":datasets",
         ":metrics",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
         "@six_archive//:six",
@@ -230,21 +232,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/eager/proto:checkpointable_object_graph_proto_py",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
     ],
 )
 
@@ -272,8 +261,7 @@ cuda_py_test(
         "//tensorflow/python/keras",
     ],
     tags = [
-        "no_oss",  # b/74395663
         "no_windows",  # TODO: needs investigation on Windows
-        "notsan",
+        "notsan",  # b/74395663
     ],
 )
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/eager/python/checkpointable_utils.py
index 34cb8d0e0887bd5e440873bae117bf27597de11b..30c4103c5aa52a74bcc8f72c7e1df186c9f7f591 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils.py
@@ -17,857 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
-import collections
 import functools
-import weakref
 
-from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpointable as core_checkpointable
-from tensorflow.python.training import checkpointable_utils as core_checkpointable_utils
-from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.util import deprecation
-
-
-_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
-
-# Keyword for identifying that the next bit of a checkpoint variable name is a
-# slot name. Checkpoint names for slot variables look like:
-#
-#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
-#
-# Where <path to variable> is a full path from the checkpoint root to the
-# variable being slotted for.
-_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
-# Keyword for separating the path to an object from the name of an
-# attribute in checkpoint names. Used like:
-#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
-_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
-# Key where the object graph proto is saved in a TensorBundle
-_OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
-
-
-# TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange
-# or consolidating the implementation with get_variable.
-def _default_getter(name, shape, dtype, initializer=None,
-                    partition_info=None, **kwargs):
-  """A pared-down version of get_variable which does not reuse variables."""
-  dtype = dtypes.as_dtype(dtype)
-  shape_object = tensor_shape.as_shape(shape)
-  with ops.init_scope():
-    if initializer is None:
-      initializer, initializing_from_value = (
-          variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
-              name=name, shape=shape_object, dtype=dtype))
-    else:
-      initializing_from_value = not callable(initializer)
-    # Same logic as get_variable
-    variable_dtype = dtype.base_dtype
-    if initializing_from_value:
-      if shape is not None:
-        raise ValueError("If initializer is a constant, do not specify shape.")
-      initial_value = initializer
-    else:
-      # Instantiate initializer if provided initializer is a type object.
-      if isinstance(initializer, type(init_ops.Initializer)):
-        initializer = initializer(dtype=dtype)
-      def initial_value():
-        return initializer(
-            shape_object.as_list(), dtype=dtype, partition_info=partition_info)
-    return resource_variable_ops.ResourceVariable(
-        initial_value=initial_value,
-        name=name,
-        dtype=variable_dtype,
-        **kwargs
-    )
-
-
-def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
-                 initializer=None):
-  """Add a variable to a Checkpointable with no scope influence."""
-  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
-      name=name, shape=shape, dtype=dtype,
-      initializer=initializer, getter=_default_getter)
-
-
-def _breadth_first_checkpointable_traversal(root_checkpointable):
-  """Find shortest paths to all variables owned by dependencies of root."""
-  bfs_sorted = []
-  to_visit = collections.deque([root_checkpointable])
-  path_to_root = {root_checkpointable: ()}
-  while to_visit:
-    current_checkpointable = to_visit.popleft()
-    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-    bfs_sorted.append(current_checkpointable)
-    for child_checkpointable in (
-        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
-      if child_checkpointable.ref not in path_to_root:
-        path_to_root[child_checkpointable.ref] = (
-            path_to_root[current_checkpointable] + (child_checkpointable,))
-        to_visit.append(child_checkpointable.ref)
-  return bfs_sorted, path_to_root
-
-
-def _escape_local_name(name):
-  # We need to support slashes in local names for compatibility, since this
-  # naming scheme is being patched in to things like Layer.add_variable where
-  # slashes were previously accepted. We also want to use slashes to indicate
-  # edges traversed to reach the variable, so we escape forward slashes in
-  # names.
-  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
-          .replace(r"/", _ESCAPE_CHAR + "S"))
-
-
-def _object_prefix_from_path(path_to_root):
-  return "/".join(
-      (_escape_local_name(checkpointable.name)
-       for checkpointable in path_to_root))
-
-
-def _slot_variable_naming_for_optimizer(optimizer_path):
-  """Make a function for naming slot variables in an optimizer."""
-  # Name slot variables:
-  #
-  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
-  #
-  # where <variable name> is exactly the checkpoint name used for the original
-  # variable, including the path from the checkpoint root and the local name in
-  # the object which owns it. Note that we only save slot variables if the
-  # variable it's slotting for is also being saved.
-
-  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
-
-  def _name_slot_variable(variable_path, slot_name):
-    """With an optimizer specified, name a slot variable."""
-    return (variable_path
-            + optimizer_identifier
-            + _escape_local_name(slot_name))
-
-  return _name_slot_variable
-
-
-def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
-  """Gather and name slot variables."""
-  non_slot_objects = list(checkpointable_objects)
-  slot_variables = {}
-  for checkpointable in non_slot_objects:
-    if isinstance(checkpointable, optimizer_lib.Optimizer):
-      naming_scheme = _slot_variable_naming_for_optimizer(
-          optimizer_path=object_names[checkpointable])
-      slot_names = checkpointable.get_slot_names()
-      for slot_name in slot_names:
-        for original_variable_node_id, original_variable in enumerate(
-            non_slot_objects):
-          try:
-            slot_variable = checkpointable.get_slot(
-                original_variable, slot_name)
-          except AttributeError:
-            slot_variable = None
-          if slot_variable is None:
-            continue
-          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
-            # TODO(allenl): Gather dependencies of slot variables.
-            raise NotImplementedError(
-                "Currently only variables with no dependencies can be saved as "
-                "slot variables. File a feature request if this limitation "
-                "bothers you.")
-          if slot_variable in node_ids:
-            raise NotImplementedError(
-                "A slot variable was re-used as a dependency of a "
-                "Checkpointable object. This is not currently allowed. File a "
-                "feature request if this limitation bothers you.")
-          checkpoint_name = naming_scheme(
-              variable_path=object_names[original_variable],
-              slot_name=slot_name)
-          object_names[slot_variable] = checkpoint_name
-          slot_variable_node_id = len(checkpointable_objects)
-          node_ids[slot_variable] = slot_variable_node_id
-          checkpointable_objects.append(slot_variable)
-          slot_variable_proto = (
-              checkpointable_object_graph_pb2.CheckpointableObjectGraph
-              .Object.SlotVariableReference(
-                  slot_name=slot_name,
-                  original_variable_node_id=original_variable_node_id,
-                  slot_variable_node_id=slot_variable_node_id))
-          slot_variables.setdefault(checkpointable, []).append(
-              slot_variable_proto)
-  return slot_variables
-
-
-def _serialize_checkpointables(
-    checkpointable_objects, node_ids, object_names, slot_variables):
-  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  named_saveables = {}
-
-  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_proto = object_graph_proto.nodes.add()
-    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
-    object_name = object_names[checkpointable]
-    for name, saveable_factory in (
-        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
-      attribute = object_proto.attributes.add()
-      attribute.name = name
-      attribute.checkpoint_key = "%s/%s/%s" % (
-          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
-      if callable(saveable_factory):
-        saveable = saveable_factory(name=attribute.checkpoint_key)
-      else:
-        saveable = saveable_factory
-      # Figure out the name-based Saver's name for this variable.
-      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
-          [saveable], convert_variable_to_tensor=False)
-      attribute.full_name, = saver_dict.keys()
-      named_saveables[attribute.checkpoint_key] = saveable
-
-    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
-      child_proto = object_proto.children.add()
-      child_proto.node_id = node_ids[child.ref]
-      child_proto.local_name = child.name
-
-  return named_saveables, object_graph_proto
-
-
-def _serialize_object_graph(root_checkpointable):
-  """Determine checkpoint keys for variables and build a serialized graph.
-
-  Non-slot variables are keyed based on a shortest path from the root saveable
-  to the object which owns the variable (i.e. the one which called
-  `Checkpointable._add_variable` to create it).
-
-  Slot variables are keyed based on a shortest path to the variable being
-  slotted for, a shortest path to their optimizer, and the slot name.
-
-  Args:
-    root_checkpointable: A `Checkpointable` object whose variables (including
-      the variables of dependencies, recursively) should be saved.
-
-  Returns:
-    A tuple of (named_variables, object_graph_proto):
-      named_variables: A dictionary mapping names to variable objects.
-      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
-        the serialized object graph and variable references.
-
-  Raises:
-    ValueError: If there are invalid characters in an optimizer's slot names.
-  """
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = {
-      obj: _object_prefix_from_path(path)
-      for obj, path in path_to_root.items()}
-  node_ids = {node: node_id for node_id, node
-              in enumerate(checkpointable_objects)}
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  return _serialize_checkpointables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names,
-      slot_variables=slot_variables)
-
-
-def gather_initializers(root_checkpointable):
-  """Traverse the object graph and find initialization ops.
-
-  Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable` and which have an `initializer` property. Includes
-  initializers for slot variables only if the variable they are slotting for and
-  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
-  saved with a checkpoint).
-
-  Args:
-    root_checkpointable: A `Checkpointable` object to gather initializers for.
-  Returns:
-    A list of initialization ops.
-  """
-  # TODO(allenl): Extract out gathering logic so the naming logic doesn't have
-  # to run.
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = {
-      obj: _object_prefix_from_path(path)
-      for obj, path in path_to_root.items()}
-  node_ids = {node: node_id for node_id, node
-              in enumerate(checkpointable_objects)}
-  _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  return [c.initializer for c in checkpointable_objects
-          if hasattr(c, "initializer") and c.initializer is not None]
-
-
-class _NoRestoreSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
-
-  def __init__(self, tensor, name):
-    spec = saver_lib.BaseSaverBuilder.SaveSpec(tensor, "", name)
-    super(_NoRestoreSaveable, self).__init__(tensor, [spec], name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    return control_flow_ops.no_op()
-
-
-class _LoadStatus(object):
-  """Abstract base for load status callbacks."""
-
-  @abc.abstractmethod
-  def assert_consumed(self):
-    """Raises an exception unless a non-trivial restoration has completed."""
-    pass
-
-  @abc.abstractmethod
-  def run_restore_ops(self, session=None):
-    """Runs restore ops from the checkpoint. Requires a valid checkpoint."""
-    pass
-
-  @abc.abstractmethod
-  def initialize_or_restore(self, session=None):
-    """Runs restore ops from the checkpoint, or initializes variables."""
-    pass
-
-
-class CheckpointLoadStatus(_LoadStatus):
-  """Checks the status of checkpoint loading and manages restore ops.
-
-  Returned from `Saver.restore`. Since `restore` may defer the loading of values
-  in the checkpoint which don't yet have corresponding Python objects,
-  `CheckpointLoadStatus` provides a callback to verify that checkpoint loading
-  is complete (`assert_consumed`).
-
-  When graph building, `restore` does not run restore ops itself since their
-  creation may be deferred. The `run_restore_ops` method must be called once all
-  Python objects with values to restore have been created and added to the
-  dependency graph (this does not necessarily have to be the whole checkpoint;
-  calling `run_restore_ops` while `assert_consumed` fails is supported and will
-  partially restore the checkpoint).
-
-  See `Saver.restore` for usage examples.
-  """
-
-  def __init__(self, checkpoint, feed_dict):
-    self._checkpoint = checkpoint
-    self._feed_dict = feed_dict
-
-  def assert_consumed(self):
-    """Asserts that all objects in the checkpoint have been created/matched.
-
-    Returns:
-      `self` for chaining.
-    Raises:
-      AssertionError: If there are any Python objects in the dependency graph
-        which have not been restored from this checkpoint or a later `restore`,
-        or if there are any checkpointed values which have not been matched to
-        Python objects.
-    """
-    for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if checkpointable is None:
-        raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
-      if checkpointable._update_uid < self._checkpoint.restore_uid:  # pylint: disable=protected-access
-        raise AssertionError(
-            "Object not assigned a value from checkpoint: %s" % (node,))
-    if self._checkpoint.slot_restorations:
-      # Sanity check; this collection should be clear if everything has been
-      # restored.
-      raise AssertionError("Unresolved slot restorations: %s" % (
-          self._checkpoint.slot_restorations,))
-    if self._checkpoint.unused_attributes:
-      raise AssertionError(
-          ("Unused attributes in these objects (the attributes exist in the "
-           "checkpoint but not in the objects): %s") % (
-               self._checkpoint.unused_attributes.items(),))
-    return self
-
-  def run_restore_ops(self, session=None):
-    """Run operations to restore objects in the dependency graph."""
-    if context.executing_eagerly():
-      return  # Run eagerly
-    if session is None:
-      session = ops.get_default_session()
-    session.run(self._checkpoint.restore_ops, feed_dict=self._feed_dict)
-
-  def initialize_or_restore(self, session=None):
-    """Alias for `run_restore_ops`.
-
-    This method has a sibling in `InitializationOnlyStatus` which instead
-    initializes variables. That type is returned if no checkpoint is specified
-    in `Saver.restore`.
-
-    Args:
-      session: The session to run restore ops in. If `None`, uses the default
-        session.
-    """
-    self.run_restore_ops(session=session)
-
-
-class InitializationOnlyStatus(_LoadStatus):
-  """Returned from `Saver.restore` when no checkpoint has been specified.
-
-  Objects of this type have the same `assert_consumed` method as
-  `CheckpointLoadStatus`, but it always fails. However,
-  `initialize_or_restore` works on objects of both types, and will
-  initialize variables in `InitializationOnlyStatus` objects or restore them
-  otherwise.
-  """
-
-  def __init__(self, root_checkpointable):
-    self._root_checkpointable = root_checkpointable
-
-  def assert_consumed(self):
-    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
-    raise AssertionError(
-        "No checkpoint specified (save_path=None); nothing is being restored.")
-
-  def run_restore_ops(self, session=None):
-    """For consistency with `CheckpointLoadStatus`.
-
-    Use `initialize_or_restore` for initializing if no checkpoint was passed
-    to `Saver.restore` and restoring otherwise.
-
-    Args:
-      session: Not used.
-    """
-    raise AssertionError(
-        "No checkpoint specified, so no restore ops are available "
-        "(save_path=None to Saver.restore).")
-
-  def initialize_or_restore(self, session=None):
-    """Runs initialization ops for variables.
-
-    Only objects which would be saved by `Saver.save` will be initialized. See
-    `gather_initializers` for details.
-
-    This method does nothing when executing eagerly (initializers get run
-    eagerly).
-
-    Args:
-      session: The session to run initialization ops in. If `None`, uses the
-        default session.
-    """
-    if context.executing_eagerly():
-      return  # run eagerly
-    if session is None:
-      session = ops.get_default_session()
-    session.run(gather_initializers(self._root_checkpointable))
-
-
-_DEPRECATED_RESTORE_INSTRUCTIONS = (
-    "Restoring a name-based tf.train.Saver checkpoint using the object-based "
-    "restore API. This mode uses global names to match variables, and so is "
-    "somewhat fragile. It also adds new restore ops to the graph each time it "
-    "is called. Prefer re-encoding training checkpoints in the object-based "
-    "format: run save() on the object-based saver (the same one this message "
-    "is coming from) and use that checkpoint in the future.")
-
-
-class NameBasedSaverStatus(_LoadStatus):
-  """Status for loading a name-based training checkpoint."""
-
-  def __init__(self, object_saver, save_path):
-    self._object_saver = object_saver
-    self._save_path = save_path
-
-  def assert_consumed(self):
-    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
-    raise AssertionError(
-        "Restoring a name-based checkpoint. No load status is available.")
-
-  @deprecation.deprecated(
-      date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
-  def run_restore_ops(self, session=None):
-    """Load the name-based training checkpoint using a new `tf.train.Saver`."""
-    if session is None and not context.executing_eagerly():
-      session = ops.get_default_session()
-    with ops.device("/cpu:0"):
-      saver_lib.Saver(self._object_saver._global_variable_names()).restore(  # pylint: disable=protected-access
-          sess=session, save_path=self._save_path)
-
-  def initialize_or_restore(self, session=None):
-    """Alias for `run_restore_ops`."""
-    self.run_restore_ops(session=session)
-
-
-class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
-  """Pretends to be a session, inserts extra feeds on run()."""
-
-  def __init__(self, session, feed_additions):
-    self._wrapped_session = session
-    self._feed_additions = feed_additions
-
-  def run(self, fetches, feed_dict=None, **kwargs):
-    if feed_dict is None:
-      feed_dict = {}
-    else:
-      feed_dict = feed_dict.copy()
-    feed_dict.update(self._feed_additions)
-    return self._wrapped_session.run(
-        fetches=fetches, feed_dict=feed_dict, **kwargs)
-
-
-def _copy_saver_with_new_var_list(old_saver, new_var_list):
-  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
-  new_saver = saver_lib.Saver(var_list=new_var_list)
-  # TODO(allenl): Move to copying functionality to Saver?
-  # pylint: disable=protected-access
-  new_saver._last_checkpoints = old_saver._last_checkpoints
-  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
-  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
-  # pylint: enable=protected-access
-  return new_saver
-
-
-class CheckpointableSaver(object):
-  """Saves and restores a `Checkpointable` object and its dependencies.
-
-  See `Checkpointable` for details of dependency management. `Saver` wraps
-  `tf.train.Saver` for saving, including extra information about the graph of
-  dependencies between Python objects. When restoring, it uses this information
-  about the save-time dependency graph to more robustly match objects with their
-  checkpointed values. When executing eagerly, it supports restoring variables
-  on object creation (see `Saver.restore`).
-
-  Values in a checkpoint are mapped to `Checkpointable` Python objects
-  (`Variable`s, `Optimizer`s, `Layer`s) based on the names provided when the
-  checkpoint was written. To avoid breaking existing checkpoints when modifying
-  a class, dependency names (the names of attributes to which `Checkpointable`
-  objects are assigned) may not change. These names are local to objects, in
-  contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
-  so allow additional program transformations.
-  """
-
-  def __init__(self, root_checkpointable):
-    """Configure saving.
-
-    Args:
-      root_checkpointable: The root of the object graph to save/restore. This
-        object and all of its dependencies are saved in the checkpoint. When
-        restoring, objects are matched and restored starting from this root.
-    """
-    # Allow passing in a weak reference to avoid reference cycles when
-    # `Checkpointable` objects save themselves.
-    self._root_checkpointable_ref = root_checkpointable
-    if not context.executing_eagerly():
-      with ops.device("/cpu:0"):
-        self._file_prefix_placeholder = constant_op.constant("model")
-    else:
-      self._file_prefix_placeholder = None
-
-    # Op caching for save
-    self._object_graph_feed_tensor = None
-    self._last_save_object_graph = None
-    self._last_save_saver = None
-
-    # Op caching for restore
-    self._last_restore_object_graph = None
-    self._last_restore_checkpoint = None
-
-  @property
-  def _root_checkpointable(self):
-    if isinstance(self._root_checkpointable_ref, weakref.ref):
-      derefed = self._root_checkpointable_ref()
-      assert derefed is not None
-      return derefed
-    else:
-      return self._root_checkpointable_ref
-
-  def save(self, file_prefix, checkpoint_number=None, session=None):
-    """Save a training checkpoint.
-
-    The saved checkpoint includes variables created by this object and any
-    Checkpointable objects it depends on at the time `Saver.save()` is called.
-
-    Args:
-      file_prefix: A prefix to use for the checkpoint filenames
-        (/path/to/directory/and_a_prefix). Names are generated based on this
-        prefix and `checkpoint_number`, if provided.
-      checkpoint_number: An integer variable or Tensor, used to number
-        checkpoints. Typically this value is saved along with other variables in
-        training checkpoints, which will happen automatically if it was created
-        by `root_checkpointable` or one of its dependencies (via
-        `Checkpointable._add_variable`).
-      session: The session to evaluate variables in. Ignored when executing
-        eagerly. If not provided when graph building, the default session is
-        used.
-
-    Returns:
-      The full path to the checkpoint.
-    """
-    named_variables, graph_proto = _serialize_object_graph(
-        self._root_checkpointable)
-    if not context.executing_eagerly():
-      if session is None:
-        session = ops.get_default_session()
-      if self._object_graph_feed_tensor is None:
-        with ops.device("/cpu:0"):
-          self._object_graph_feed_tensor = constant_op.constant(
-              "", dtype=dtypes.string)
-      object_graph_tensor = self._object_graph_feed_tensor
-      feed_additions = {object_graph_tensor: graph_proto.SerializeToString()}
-    else:
-      session = None
-      with ops.device("/cpu:0"):
-        object_graph_tensor = constant_op.constant(
-            graph_proto.SerializeToString(), dtype=dtypes.string)
-      feed_additions = None
-    assert _OBJECT_GRAPH_PROTO_KEY not in named_variables
-    named_variables[_OBJECT_GRAPH_PROTO_KEY] = _NoRestoreSaveable(
-        tensor=object_graph_tensor,
-        name=_OBJECT_GRAPH_PROTO_KEY)
-    if (self._last_save_object_graph != graph_proto
-        # When executing eagerly, we need to re-create SaveableObjects each time
-        # save() is called so they pick up new Tensors passed to their
-        # constructors. That means the Saver needs to be copied with a new
-        # var_list.
-        or context.executing_eagerly()):
-      if self._last_save_object_graph is not None:
-        self._last_save_saver = _copy_saver_with_new_var_list(
-            old_saver=self._last_save_saver, new_var_list=named_variables)
-      else:
-        self._last_save_saver = saver_lib.Saver(var_list=named_variables)
-      self._last_save_object_graph = graph_proto
-    with ops.device("/cpu:0"):
-      save_path = self._last_save_saver.save(
-          sess=_SessionWithFeedDictAdditions(
-              session=session, feed_additions=feed_additions),
-          save_path=file_prefix,
-          write_meta_graph=False,
-          global_step=checkpoint_number)
-    return save_path
-
-  def _global_variable_names(self):
-    """Generate a `tf.train.Saver`-style `var_list` using `variable.name`s."""
-    named_saveables, graph_proto = _serialize_object_graph(
-        self._root_checkpointable)
-    saver_names = {}
-    for object_proto in graph_proto.nodes:
-      for attribute_proto in object_proto.attributes:
-        saver_names[attribute_proto.full_name] = named_saveables[
-            attribute_proto.checkpoint_key]
-    return saver_names
-
-  def restore(self, save_path):
-    """Restore a training checkpoint.
-
-    Restores `root_checkpointable` and any objects that it tracks
-    (transitive). Either assigns values immediately if variables to restore have
-    been created already, or defers restoration until the variables are
-    created. Dependencies added to the `root_checkpointable` passed to the
-    constructor after this call will be matched if they have a corresponding
-    object in the checkpoint.
-
-    When building a graph, restorations are added to the graph but not run.
-
-    To disallow deferred loading, assert immediately that all checkpointed
-    variables have been matched to variable objects:
-
-    ```python
-    saver = Saver(root)
-    saver.restore(path).assert_consumed()
-    ```
-
-    An exception will be raised unless every object was matched and its
-    variables already exist.
-
-    When graph building, `assert_consumed()` indicates that all of the restore
-    ops which will be created for this checkpoint have been created. They can be
-    run via the `run_restore_ops()` function of the status object:
-
-    ```python
-    saver.restore(path).assert_consumed().run_restore_ops()
-    ```
-
-    If the checkpoint has not been consumed completely, then the list of restore
-    ops will grow as more objects are added to the dependency graph.
-
-    Name-based `tf.train.Saver` checkpoints can be loaded using this
-    method. There is no deferred loading, and names are used to match
-    variables. No restore ops are created/run until `run_restore_ops()` or
-    `initialize_or_restore()` are called on the returned status object, even
-    when executing eagerly. Re-encode name-based checkpoints using this
-    object-based `Saver.save` as soon as possible.
-
-    Args:
-      save_path: The path to the checkpoint, as returned by `save` or
-        `tf.train.latest_checkpoint`. If None (as when there is no latest
-        checkpoint for `tf.train.latest_checkpoint` to return), returns an
-        object which may run initializers for objects in the dependency
-        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
-        names are used to match variables.
-
-    Returns:
-      A load status object, which can be used to make assertions about the
-      status of checkpoint restoration and run initialization/restore ops
-      (of type `CheckpointLoadStatus`, or `InitializationOnlyStatus` if
-      `save_path` is `None`).
-
-      If `save_path` points to a name-based checkpoint, a `NameBasedSaverStatus`
-      object is returned which runs restore ops from a name-based saver.
-    """
-    if save_path is None:
-      return InitializationOnlyStatus(self._root_checkpointable)
-    in_graph_mode = not context.executing_eagerly()
-    if in_graph_mode:
-      file_prefix_tensor = self._file_prefix_placeholder
-      file_prefix_feed_dict = {self._file_prefix_placeholder: save_path}
-    else:
-      with ops.device("/cpu:0"):
-        file_prefix_tensor = constant_op.constant(save_path)
-      file_prefix_feed_dict = None
-    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-    try:
-      object_graph_string = reader.get_tensor(_OBJECT_GRAPH_PROTO_KEY)
-    except errors_impl.NotFoundError:
-      # The object graph proto does not exist in this checkpoint. Try again with
-      # name-based saving.
-      return NameBasedSaverStatus(self, save_path)
-
-    object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-    object_graph_proto.ParseFromString(object_graph_string)
-    if in_graph_mode and object_graph_proto == self._last_restore_object_graph:
-      checkpoint = self._last_restore_checkpoint
-    else:
-      if in_graph_mode:
-        dtype_map = None
-      else:
-        dtype_map = reader.get_variable_to_dtype_map()
-      checkpoint = core_checkpointable_utils._Checkpoint(  # pylint: disable=protected-access
-          object_graph_proto=object_graph_proto,
-          save_path=file_prefix_tensor,
-          dtype_map=dtype_map)
-      if in_graph_mode:
-        if self._last_restore_object_graph is not None:
-          raise NotImplementedError(
-              "Using a single Saver to restore different object graphs is not "
-              "currently supported when graph building. Use a different Saver "
-              "for each object graph (restore ops will be duplicated), or "
-              "file a feature request if this limitation bothers you.")
-        self._last_restore_checkpoint = checkpoint
-        self._last_restore_object_graph = object_graph_proto
-    core_checkpointable._CheckpointPosition(  # pylint: disable=protected-access
-        checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
-    load_status = CheckpointLoadStatus(
-        checkpoint, feed_dict=file_prefix_feed_dict)
-    return load_status
-
-
-class Checkpoint(core_checkpointable.Checkpointable):
-  """A utility class which groups `Checkpointable` objects.
-
-  Accepts arbitrary keyword arguments to its constructor and saves those values
-  with a checkpoint. Maintains a `save_counter` for numbering checkpoints.
-
-  Example usage:
-
-  ```python
-  import tensorflow as tf
-  import tensorflow.contrib.eager as tfe
-  import os
-
-  checkpoint_directory = "/tmp/training_checkpoints"
-  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-
-  root = tfe.Checkpoint(optimizer=optimizer, model=model)
-  root.restore(tf.train.latest_checkpoint(checkpoint_directory))
-  for _ in range(num_training_steps):
-    optimizer.minimize( ... )
-  root.save(file_prefix=checkpoint_prefix)
-  ```
-
-  For more manual control over saving, use `tfe.CheckpointableSaver` directly.
-
-  Attributes:
-    save_counter: Incremented when `save()` is called. Used to number
-      checkpoints.
-  """
-
-  def __init__(self, **kwargs):
-    """Group objects into a training checkpoint.
-
-    Args:
-      **kwargs: Keyword arguments are set as attributes of this object, and are
-        saved with the checkpoint. Attribute values must derive from
-        `CheckpointableBase`.
-    Raises:
-      ValueError: If objects in `kwargs` are not Checkpointable.
-    """
-    super(Checkpoint, self).__init__()
-    for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, core_checkpointable.CheckpointableBase):
-        raise ValueError(
-            ("`Checkpoint` was expecting an object derived from "
-             "`CheckpointableBase`, got %s.") % (v,))
-      setattr(self, k, v)
-    self._save_counter = None  # Created lazily for restore-on-create.
-    self._saver = CheckpointableSaver(weakref.ref(self))
-
-  def _maybe_create_save_counter(self):
-    """Create a save counter if it does not yet exist."""
-    if self._save_counter is None:
-      # Initialized to 0 and incremented before saving.
-      with ops.device("/cpu:0"):
-        self._save_counter = add_variable(
-            self, name="save_counter", initializer=0, dtype=dtypes.int64)
-
-  @property
-  def save_counter(self):
-    """An integer variable which starts at zero and is incremented on save.
-
-    Used to number checkpoints.
-
-    Returns:
-      The save counter variable.
-    """
-    self._maybe_create_save_counter()
-    return self._save_counter
-
-  def save(self, file_prefix, session=None):
-    """Save a checkpoint. Wraps `tfe.CheckpointableSaver.save`."""
-    in_graph_mode = not context.executing_eagerly()
-    if in_graph_mode:
-      if session is None:
-        session = ops.get_default_session()
-      if self._save_counter is None:
-        # When graph building, if this is a new save counter variable then it
-        # needs to be initialized before assign_add. This is only an issue if
-        # restore() has not been called first.
-        session.run(self.save_counter.initializer)
-    with ops.colocate_with(self.save_counter):
-      assign_op = self.save_counter.assign_add(1)
-    if in_graph_mode:
-      session.run(assign_op)
-    return self._saver.save(
-        file_prefix=file_prefix,
-        checkpoint_number=self.save_counter,
-        session=session)
-
-  def restore(self, save_path):
-    """Restore a checkpoint. Wraps `tfe.CheckpointableSaver.restore`."""
-    status = self._saver.restore(save_path=save_path)
-    # Create the save counter now so it gets initialized with other variables
-    # when graph building. Creating it earlier would lead to double
-    # initialization when executing eagerly.
-    self._maybe_create_save_counter()
-    return status
 
 
 class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index 891c093a0f667deca6c26c453a83eca7305166a0..bd42d405db9d1275c83636dc83090fa11b0b74b1 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -16,58 +16,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
 import os
 
-import six
-
-from tensorflow.contrib.eager.python import checkpointable_utils
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
+from tensorflow.contrib.eager.python import checkpointable_utils as contrib_checkpointable_utils
 from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras._impl.keras.engine import sequential
-from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import template
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import adam
 from tensorflow.python.training import checkpointable
-from tensorflow.python.training import saver as core_saver
-from tensorflow.python.training import training_util
-
-
-class NonLayerCheckpointable(checkpointable.Checkpointable):
-
-  def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
-        self, name="a_variable", shape=[])
-
-
-# pylint: disable=not-callable
-class MyModel(training.Model):
-  """A concrete Model for testing."""
-
-  def __init__(self):
-    super(MyModel, self).__init__()
-    self._named_dense = core.Dense(1, use_bias=True)
-    self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
-
-  def call(self, values):
-    ret = self._second(self._named_dense(values))
-    return ret
+from tensorflow.python.training import checkpointable_utils
 
 
 def _split_variable_closure(variable):
@@ -90,7 +47,7 @@ class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase):
 
   def __init__(self):
     self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
-    split_dependencies = checkpointable_utils.split_dependency(
+    split_dependencies = contrib_checkpointable_utils.split_dependency(
         component_names=("first_half", "second_half"),
         component_dtypes=(self.combined.dtype,) * 2,
         fill_save_buffer_fn=_split_variable_closure(
@@ -116,7 +73,7 @@ class OnlyOneDep(checkpointable.Checkpointable):
 
 class SplitTests(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testSaveRestoreSplitDep(self):
     save_checkpoint = checkpointable_utils.Checkpoint(
         dep=SaveTensorSlicesAsDeps())
@@ -151,1195 +108,5 @@ class SplitTests(test.TestCase):
         self.evaluate(restore_checkpoint.dep.combined))
 
 
-class InterfaceTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testAddVariable(self):
-    obj = NonLayerCheckpointable()
-    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
-      checkpointable_utils.add_variable(
-          obj, name="shape_specified_twice", shape=[], initializer=1)
-    constant_initializer = checkpointable_utils.add_variable(
-        obj, name="constant_initializer", initializer=1)
-    with variable_scope.variable_scope("some_variable_scope"):
-      ones_initializer = checkpointable_utils.add_variable(
-          obj,
-          name="ones_initializer",
-          shape=[2],
-          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
-    bare_initializer = checkpointable_utils.add_variable(
-        obj,
-        name="bare_initializer",
-        shape=[2, 2],
-        dtype=dtypes.float64,
-        initializer=init_ops.zeros_initializer)
-
-    # Even in graph mode, there are no naming conflicts between objects, only
-    # naming conflicts within an object.
-    other_duplicate = resource_variable_ops.ResourceVariable(
-        name="duplicate", initial_value=1.)
-    duplicate = checkpointable_utils.add_variable(
-        obj, name="duplicate", shape=[])
-    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
-      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
-
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    self.assertEqual("constant_initializer:0", constant_initializer.name)
-    self.assertEqual(1, self.evaluate(constant_initializer))
-    self.assertEqual("some_variable_scope/ones_initializer:0",
-                     ones_initializer.name)
-    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
-    self.assertAllEqual([[0., 0.],
-                         [0., 0.]], self.evaluate(bare_initializer))
-    self.assertEqual("a_variable:0", obj.a_variable.name)
-    self.assertEqual("duplicate:0", other_duplicate.name)
-    if context.executing_eagerly():
-      # When executing eagerly, there's no uniquification of variable names. The
-      # checkpoint name will be the same.
-      self.assertEqual("duplicate:0", duplicate.name)
-    else:
-      # The .name attribute may be globally influenced, but the checkpoint name
-      # won't be (tested below).
-      self.assertEqual("duplicate_1:0", duplicate.name)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(obj)
-    expected_checkpoint_names = (
-        "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
-        "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-        "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-        "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
-        "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
-    )
-    six.assertCountEqual(
-        self, expected_checkpoint_names, named_variables.keys())
-
-  def testInitNotCalled(self):
-
-    class NoInit(checkpointable.Checkpointable):
-
-      def __init__(self):
-        pass
-
-    # __init__ for Checkpointable will be called implicitly.
-    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
-
-  def testShapeDtype(self):
-    root = checkpointable.Checkpointable()
-    v1 = checkpointable_utils.add_variable(
-        root, name="v1", initializer=3., dtype=dtypes.float64)
-    self.assertEqual(dtypes.float64, v1.dtype)
-    v2 = checkpointable_utils.add_variable(
-        root,
-        name="v2",
-        shape=[3],
-        initializer=init_ops.ones_initializer,
-        dtype=dtypes.float64)
-    self.assertEqual(dtypes.float64, v2.dtype)
-    self.assertAllEqual([1., 1., 1.], self.evaluate(v2))
-
-
-class _MirroringSaveable(core_saver.BaseSaverBuilder.SaveableObject):
-
-  def __init__(self, primary_variable, mirrored_variable, name):
-    self._primary_variable = primary_variable
-    self._mirrored_variable = mirrored_variable
-    tensor = self._primary_variable.read_value()
-    spec = core_saver.BaseSaverBuilder.SaveSpec(
-        tensor=tensor,
-        slice_spec="",
-        name=name)
-    super(_MirroringSaveable, self).__init__(
-        tensor, [spec], name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    """Restore the same value into both variables."""
-    tensor, = restored_tensors
-    return control_flow_ops.group(
-        self._primary_variable.assign(tensor),
-        self._mirrored_variable.assign(tensor))
-
-
-class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
-  """A Checkpointable object which returns a more complex SaveableObject."""
-
-  def __init__(self):
-    self.non_dep_variable = variable_scope.get_variable(
-        name="non_dep_variable", initializer=6., use_resource=True)
-    self.mirrored = variable_scope.get_variable(
-        name="mirrored", initializer=15., use_resource=True)
-
-  def _gather_saveables_for_checkpoint(self):
-    def _saveable_factory(name=self.non_dep_variable.name):
-      return _MirroringSaveable(
-          primary_variable=self.non_dep_variable,
-          mirrored_variable=self.mirrored,
-          name=name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
-
-  # The Saver sorts by name before parsing, so we need a name property.
-  @property
-  def name(self):
-    return self.non_dep_variable.name
-
-
-class CheckpointingTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testNamingWithOptimizer(self):
-    input_value = constant_op.constant([[3.]])
-    model = MyModel()
-    # A nuisance Model using the same optimizer. Its slot variables should not
-    # go in the checkpoint, since it is never depended on.
-    other_model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
-    if context.executing_eagerly():
-      optimizer.minimize(
-          lambda: model(input_value),
-          global_step=optimizer_step)
-      optimizer.minimize(
-          lambda: other_model(input_value),
-          global_step=optimizer_step)
-    else:
-      train_op = optimizer.minimize(
-          model(input_value), global_step=optimizer_step)
-      optimizer.minimize(
-          other_model(input_value),
-          global_step=optimizer_step)
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
-      self.evaluate(train_op)
-    named_variables, serialized_graph = (
-        checkpointable_utils._serialize_object_graph(root_checkpointable))
-    expected_checkpoint_names = (
-        # Created in the root node, so no prefix.
-        "optimizer_step",
-        "model/_second/kernel",
-        "model/_named_dense/kernel",
-        "model/_named_dense/bias",
-        # non-Layer dependency of the model
-        "model/_non_layer/a_variable",
-        # The optimizer creates two non-slot variables
-        "optimizer/beta1_power",
-        "optimizer/beta2_power",
-        # Slot variables
-        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
-        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
-        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
-        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
-        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
-        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
-    )
-    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
-    expected_checkpoint_names = [
-        name + suffix for name in expected_checkpoint_names]
-    six.assertCountEqual(self, expected_checkpoint_names,
-                         named_variables.keys())
-    # Check that we've mapped to the right variable objects (not exhaustive)
-    self.assertEqual(
-        "global_step:0",
-        named_variables["optimizer_step" + suffix].name)
-    self.assertEqual(
-        "my_model/dense_1/kernel:0",
-        named_variables["model/_second/kernel" + suffix].name)
-    self.assertEqual(
-        "my_model/dense/kernel:0",
-        named_variables["model/_named_dense/kernel" + suffix].name)
-    self.assertEqual(
-        "beta1_power:0",
-        named_variables["optimizer/beta1_power" + suffix].name)
-    self.assertEqual(
-        "beta2_power:0",
-        named_variables["optimizer/beta2_power" + suffix].name)
-    # Spot check the generated protocol buffers.
-    self.assertEqual("optimizer",
-                     serialized_graph.nodes[0].children[1].local_name)
-    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
-        1].node_id]
-    self.assertEqual("beta1_power",
-                     optimizer_node.children[0].local_name)
-    self.assertEqual("beta1_power",
-                     serialized_graph.nodes[optimizer_node.children[0].node_id]
-                     .attributes[0].full_name)
-    self.assertEqual(
-        "my_model/dense/kernel",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .original_variable_node_id]
-        .attributes[0].full_name)
-    # We strip off the :0 suffix, as variable.name-based saving does.
-    self.assertEqual(
-        "my_model/dense/kernel/Adam",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .slot_variable_node_id]
-        .attributes[0].full_name)
-    self.assertEqual(
-        "my_model/dense/kernel/Adam:0",
-        optimizer.get_slot(
-            var=named_variables["model/_named_dense/kernel" + suffix],
-            name="m").name)
-    self.assertEqual(
-        "model/_named_dense/kernel" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .original_variable_node_id].attributes[0].checkpoint_key)
-    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
-    self.assertEqual(
-        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .slot_variable_node_id].attributes[0].checkpoint_key)
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testMoreComplexSaveableReturned(self):
-    v = _OwnsMirroredVariables()
-    checkpoint = checkpointable_utils.Checkpoint(v=v)
-    test_dir = self.get_temp_dir()
-    prefix = os.path.join(test_dir, "ckpt")
-    self.evaluate(v.non_dep_variable.assign(42.))
-    save_path = checkpoint.save(prefix)
-    self.evaluate(v.non_dep_variable.assign(43.))
-    self.evaluate(v.mirrored.assign(44.))
-    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
-    self.assertEqual(42., self.evaluate(v.non_dep_variable))
-    self.assertEqual(42., self.evaluate(v.mirrored))
-    self.evaluate(v.non_dep_variable.assign(44.))
-    save_path = checkpoint.save(prefix)
-    self.evaluate(v.non_dep_variable.assign(45.))
-    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
-    self.assertEqual(44., self.evaluate(v.non_dep_variable))
-    self.assertEqual(44., self.evaluate(v.mirrored))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testMoreComplexSaveableReturnedWithGlobalName(self):
-    # The same object can also be saved using the name-based saver.
-    v = _OwnsMirroredVariables()
-    saver = core_saver.Saver(var_list=[v])
-    test_dir = self.get_temp_dir()
-    prefix = os.path.join(test_dir, "ckpt")
-    self.evaluate(v.non_dep_variable.assign(42.))
-    with self.test_session() as sess:
-      save_path = saver.save(sess, prefix)
-      self.evaluate(v.non_dep_variable.assign(43.))
-      self.evaluate(v.mirrored.assign(44.))
-      saver.restore(sess, save_path)
-      self.assertEqual(42., self.evaluate(v.non_dep_variable))
-      self.assertEqual(42., self.evaluate(v.mirrored))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testSaveRestore(self):
-    model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model)
-    input_value = constant_op.constant([[3.]])
-    if context.executing_eagerly():
-      optimizer.minimize(
-          lambda: model(input_value))
-    else:
-      train_op = optimizer.minimize(model(input_value))
-      # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
-      self.evaluate(train_op)
-    prefix = os.path.join(self.get_temp_dir(), "ckpt")
-    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
-    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
-    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
-    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
-    optimizer_variables = self.evaluate(optimizer.variables())
-    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
-    # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
-    status.run_restore_ops()
-    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
-    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
-    if not context.executing_eagerly():
-      return  # Restore-on-create is only supported when executing eagerly
-    on_create_model = MyModel()
-    on_create_optimizer = adam.AdamOptimizer(
-        0.001,
-        # Preserve beta1_power and beta2_power when appying gradients so we can
-        # test that they've been restored correctly.
-        beta1=1.0, beta2=1.0)
-    on_create_root = checkpointable_utils.Checkpoint(
-        optimizer=on_create_optimizer, model=on_create_model)
-    # Deferred restoration
-    status = on_create_root.restore(save_path=save_path)
-    on_create_model(constant_op.constant([[3.]]))  # create variables
-    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
-    self.assertAllEqual([42.],
-                        self.evaluate(
-                            on_create_model._named_dense.variables[1]))
-    on_create_m_bias_slot = on_create_optimizer.get_slot(
-        on_create_model._named_dense.variables[1], "m")
-    # Optimizer slot variables are created when the original variable is
-    # restored.
-    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
-    self.assertAllEqual(optimizer_variables[2:],
-                        self.evaluate(on_create_optimizer.variables()))
-    dummy_var = resource_variable_ops.ResourceVariable([1.])
-    on_create_optimizer.minimize(loss=dummy_var.read_value)
-    status.assert_consumed()
-    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
-    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
-    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
-
-  # TODO(allenl): Debug garbage created by this test in python3.
-  def testDeferredRestorationUsageEager(self):
-    """An idiomatic eager execution example."""
-    num_training_steps = 10
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    for training_continuation in range(3):
-      model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
-          optimizer=optimizer, model=model,
-          optimizer_step=training_util.get_or_create_global_step())
-      root.restore(core_saver.latest_checkpoint(checkpoint_directory))
-      for _ in range(num_training_steps):
-        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
-        input_value = constant_op.constant([[3.]])
-        optimizer.minimize(
-            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
-            global_step=root.optimizer_step)
-      root.save(file_prefix=checkpoint_prefix)
-      self.assertEqual((training_continuation + 1) * num_training_steps,
-                       root.optimizer_step.numpy())
-
-  def testUsageGraph(self):
-    """Expected usage when graph building."""
-    with context.graph_mode():
-      num_training_steps = 10
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      for training_continuation in range(3):
-        with ops.Graph().as_default():
-          model = MyModel()
-          optimizer = adam.AdamOptimizer(0.001)
-          root = checkpointable_utils.Checkpoint(
-              optimizer=optimizer, model=model,
-              global_step=training_util.get_or_create_global_step())
-          input_value = constant_op.constant([[3.]])
-          train_op = optimizer.minimize(
-              model(input_value),
-              global_step=root.global_step)
-          checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
-          with self.test_session(graph=ops.get_default_graph()) as session:
-            status = root.restore(save_path=checkpoint_path)
-            status.initialize_or_restore(session=session)
-            if checkpoint_path is None:
-              self.assertEqual(0, training_continuation)
-              with self.assertRaises(AssertionError):
-                status.assert_consumed()
-            else:
-              status.assert_consumed()
-            for _ in range(num_training_steps):
-              session.run(train_op)
-            root.save(file_prefix=checkpoint_prefix, session=session)
-            self.assertEqual((training_continuation + 1) * num_training_steps,
-                             session.run(root.global_step))
-            self.assertEqual(training_continuation + 1,
-                             session.run(root.save_counter))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testAgnosticUsage(self):
-    """Graph/eager agnostic usage."""
-    # Does create garbage when executing eagerly due to ops.Graph() creation.
-    num_training_steps = 10
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    for training_continuation in range(3):
-      with ops.Graph().as_default(), self.test_session(
-          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
-        model = MyModel()
-        optimizer = adam.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, model=model,
-            global_step=training_util.get_or_create_global_step())
-        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
-        status = root.restore(save_path=checkpoint_path)
-        input_value = constant_op.constant([[3.]])
-        train_fn = functools.partial(
-            optimizer.minimize,
-            functools.partial(model, input_value),
-            global_step=root.global_step)
-        if not context.executing_eagerly():
-          train_fn = functools.partial(self.evaluate, train_fn())
-        status.initialize_or_restore()
-        for _ in range(num_training_steps):
-          train_fn()
-        root.save(file_prefix=checkpoint_prefix)
-        self.assertEqual((training_continuation + 1) * num_training_steps,
-                         self.evaluate(root.global_step))
-        self.assertEqual(training_continuation + 1,
-                         self.evaluate(root.save_counter))
-
-  def _get_checkpoint_name(self, name):
-    root = checkpointable.Checkpointable()
-    checkpointable_utils.add_variable(
-        root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    checkpoint_name, = named_variables.keys()
-    with ops.name_scope("root/" + checkpoint_name):
-      pass  # Make sure we can use this as an op name if we prefix it.
-    return checkpoint_name
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testVariableNameEscaping(self):
-    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
-    self.assertEqual(r"a.Sb.Sc" + suffix, self._get_checkpoint_name(r"a/b/c"))
-    self.assertEqual(r"b" + suffix, self._get_checkpoint_name(r"b"))
-    self.assertEqual(r"c.S" + suffix, self._get_checkpoint_name(r"c/"))
-    self.assertEqual(r"d.S..S" + suffix, self._get_checkpoint_name(r"d/.S"))
-    self.assertEqual(r"d.S..ATTRIBUTES.Sf" + suffix,
-                     self._get_checkpoint_name(r"d/.ATTRIBUTES/f"))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testNumberedPath(self):
-    root = checkpointable.Checkpointable()
-    leaf = checkpointable.Checkpointable()
-    root.leaf = leaf
-    checkpointable_utils.add_variable(leaf, name="v", shape=[])
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    variable_name, = named_variables.keys()
-    self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", variable_name)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testLocalNameValidation(self):
-    root = checkpointable.Checkpointable()
-    leaf = checkpointable.Checkpointable()
-    # Dots are escaped, which avoids conflicts with reserved names.
-    root._track_checkpointable(leaf, name=".ATTRIBUTES")
-    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
-    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
-    name, = named_variables.keys()
-    self.assertEqual(name, "..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE")
-
-  def testAnonymousVarsInInit(self):
-
-    class Model(training.Model):
-
-      def __init__(self):
-        super(Model, self).__init__()
-        self.w = resource_variable_ops.ResourceVariable(0.0)
-        self.b = resource_variable_ops.ResourceVariable(0.0)
-        self.vars = [self.w, self.b]
-
-      def call(self, x):
-        return x * self.w + self.b
-
-    with context.eager_mode():
-      model = Model()
-      optimizer = adam.AdamOptimizer(learning_rate=0.05)
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      checkpoint = checkpointable_utils.Checkpoint(
-          model=model, optimizer=optimizer)
-      for _ in range(2):
-        checkpoint.save(checkpoint_prefix)
-        with backprop.GradientTape() as tape:
-          loss = (constant_op.constant(1.)
-                  - model(constant_op.constant(1.))) ** 2
-        grad = tape.gradient(loss, model.vars)
-        optimizer.apply_gradients(
-            [(g, v) for g, v in zip(grad, model.vars)])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testLateDependencyTracking(self):
-
-    class Dependency(checkpointable.Checkpointable):
-
-      def build(self):
-        self.var = checkpointable_utils.add_variable(
-            self, "var", initializer=0.)
-
-    class LateDependencies(checkpointable.Checkpointable):
-
-      def add_dep(self):
-        self.dep = Dependency()
-        self.dep.build()
-
-    original = LateDependencies()
-    original.add_dep()
-    self.evaluate(state_ops.assign(original.dep.var, 123.))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(
-        original).save(checkpoint_prefix)
-    load_into = LateDependencies()
-    status = checkpointable_utils.CheckpointableSaver(
-        load_into).restore(save_path)
-    with self.assertRaises(AssertionError):
-      status.assert_consumed()
-    load_into.add_dep()
-    status.assert_consumed()
-    status.run_restore_ops()
-    self.assertEqual(123., self.evaluate(load_into.dep.var))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testDepAfterVar(self):
-
-    class Dependency(checkpointable.Checkpointable):
-
-      def build(self):
-        self.var = checkpointable_utils.add_variable(
-            self, "var", initializer=0.)
-
-    class DepAfterVar(checkpointable.Checkpointable):
-
-      def add_dep(self):
-        dep = Dependency()
-        dep.build()
-        self.dep = dep
-
-    dep_after_var = DepAfterVar()
-    dep_after_var.add_dep()
-    self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(dep_after_var).save(
-        checkpoint_prefix)
-
-    loaded_dep_after_var = DepAfterVar()
-    status = checkpointable_utils.CheckpointableSaver(
-        loaded_dep_after_var).restore(save_path)
-    loaded_dep_after_var.add_dep()
-    status.assert_consumed()
-    status.run_restore_ops()
-    self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testDeferredSlotRestoration(self):
-    checkpoint_directory = self.get_temp_dir()
-
-    root = checkpointable.Checkpointable()
-    root.var = checkpointable_utils.add_variable(
-        root, name="var", initializer=0.)
-    optimizer = adam.AdamOptimizer(0.1)
-    if context.executing_eagerly():
-      optimizer.minimize(root.var.read_value)
-    else:
-      train_op = optimizer.minimize(root.var)
-      # Note that `optimizer` has not been added as a dependency of
-      # `root`. Create a one-off grouping so that slot variables for `root.var`
-      # get initialized too.
-      self.evaluate(checkpointable_utils.gather_initializers(
-          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
-      self.evaluate(train_op)
-    self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
-    root.optimizer = optimizer
-    self.evaluate(state_ops.assign(root.var, 13.))
-    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
-                                   14.))
-    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = checkpointable.Checkpointable()
-    # Load the slot-containing checkpoint (deferred), then immediately overwrite
-    # the non-slot variable (also deferred).
-    slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(no_slots_path)
-    with self.assertRaises(AssertionError):
-      no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
-        new_root, name="var", shape=[])
-    no_slot_status.assert_consumed()
-    no_slot_status.run_restore_ops()
-    self.assertEqual(12., self.evaluate(new_root.var))
-    new_root.optimizer = adam.AdamOptimizer(0.1)
-    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
-      slot_status.assert_consumed()
-    self.assertEqual(12., self.evaluate(new_root.var))
-    if context.executing_eagerly():
-      # Slot variables are only created with restoring initializers when
-      # executing eagerly.
-      self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
-    else:
-      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
-                    None)
-    if context.executing_eagerly():
-      new_root.optimizer.minimize(new_root.var.read_value)
-    else:
-      train_op = new_root.optimizer.minimize(new_root.var)
-      # The slot variable now exists; restore() didn't create it, but we should
-      # now have a restore op for it.
-      slot_status.run_restore_ops()
-      self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
-      self.evaluate(train_op)
-    slot_status.assert_consumed()
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testOverlappingRestores(self):
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep = checkpointable.Checkpointable()
-    save_root.dep.var = checkpointable_utils.add_variable(
-        save_root.dep, name="var", initializer=0.)
-    self.evaluate(state_ops.assign(save_root.dep.var, 12.))
-    saver = checkpointable_utils.CheckpointableSaver(save_root)
-    first_path = saver.save(os.path.join(checkpoint_directory, "first"))
-    self.evaluate(state_ops.assign(save_root.dep.var, 13.))
-    second_path = saver.save(os.path.join(checkpoint_directory, "second"))
-
-    first_root = checkpointable.Checkpointable()
-    second_root = checkpointable.Checkpointable()
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    load_dep = checkpointable.Checkpointable()
-    load_dep.var = checkpointable_utils.add_variable(
-        load_dep, name="var", shape=[])
-    first_root.dep = load_dep
-    first_status.assert_consumed()
-    first_status.run_restore_ops()
-    self.assertEqual(12., self.evaluate(load_dep.var))
-    second_root.dep = load_dep
-    second_status.assert_consumed()
-    second_status.run_restore_ops()
-    self.assertEqual(13., self.evaluate(load_dep.var))
-
-    # Try again with the order of the restore() reversed. The last restore
-    # determines the final value.
-    first_root = checkpointable.Checkpointable()
-    second_root = checkpointable.Checkpointable()
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    load_dep = checkpointable.Checkpointable()
-    load_dep.var = checkpointable_utils.add_variable(
-        load_dep, name="var", shape=[])
-    first_root.dep = load_dep
-    first_status.assert_consumed()
-    first_status.run_restore_ops()
-    self.assertEqual(12., self.evaluate(load_dep.var))
-    second_root.dep = load_dep
-    second_status.assert_consumed()
-    second_status.run_restore_ops()
-    self.assertEqual(12., self.evaluate(load_dep.var))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testAmbiguousLoad(self):
-    # Not OK to split one checkpoint object into two
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep_one = checkpointable.Checkpointable()
-    save_root.dep_two = checkpointable.Checkpointable()
-    dep_three = checkpointable.Checkpointable()
-    save_root.dep_one.dep_three = dep_three
-    save_root.dep_two.dep_three = dep_three
-    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = checkpointable.Checkpointable()
-    checkpointable_utils.CheckpointableSaver(load_root).restore(save_path)
-    load_root.dep_one = checkpointable.Checkpointable()
-    load_root.dep_two = checkpointable.Checkpointable()
-    load_root.dep_one.dep_three = checkpointable.Checkpointable()
-    with self.assertRaisesRegexp(AssertionError,
-                                 "resolved to different objects"):
-      load_root.dep_two.dep_three = checkpointable.Checkpointable()
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testObjectsCombined(self):
-    # Currently fine to load two checkpoint objects into one Python object
-    checkpoint_directory = self.get_temp_dir()
-    save_root = checkpointable.Checkpointable()
-    save_root.dep_one = checkpointable.Checkpointable()
-    save_root.dep_two = checkpointable.Checkpointable()
-    checkpointable_utils.add_variable(
-        save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
-    checkpointable_utils.add_variable(
-        save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = checkpointable.Checkpointable()
-    load_root.dep_one = checkpointable.Checkpointable()
-    load_root.dep_two = load_root.dep_one
-    v1 = checkpointable_utils.add_variable(
-        load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
-    v2 = checkpointable_utils.add_variable(
-        load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
-    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
-        save_path).assert_consumed()
-    status.run_restore_ops()
-    self.assertEqual(32., self.evaluate(v1))
-    self.assertEqual(64., self.evaluate(v2))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testDependencyLoop(self):
-    # Note: this test creates garbage during eager execution because it
-    # purposefully creates a reference cycle.
-    first = checkpointable.Checkpointable()
-    second = checkpointable.Checkpointable()
-    first.second = second
-    second.first = first
-    first.v = checkpointable_utils.add_variable(
-        first, "v1", initializer=[3., 1., 4.])
-    second.v = checkpointable_utils.add_variable(
-        second, "v2", initializer=[1., 1., 2., 3.])
-    self.evaluate(checkpointable_utils.gather_initializers(first))
-    checkpoint_directory = self.get_temp_dir()
-    save_path = checkpointable_utils.CheckpointableSaver(first).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-
-    # Test deferred loading
-    first_load = checkpointable.Checkpointable()
-    status = checkpointable_utils.CheckpointableSaver(
-        first_load).restore(save_path)
-    second_load = checkpointable.Checkpointable()
-    first_load.second = second_load
-    second_load.first = first_load
-    with self.assertRaises(AssertionError):
-      status.assert_consumed()
-    first_load.v = checkpointable_utils.add_variable(
-        first_load, "v1", shape=[3])
-    second_load.v = checkpointable_utils.add_variable(
-        second_load, "v2", shape=[4])
-    status.assert_consumed()
-    status.run_restore_ops()
-    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
-    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
-
-    # Test loading when variables have already been created
-    self.evaluate(first_load.v.assign([2., 7., 1.]))
-    self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
-    self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
-    self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
-    status = checkpointable_utils.CheckpointableSaver(first_load).restore(
-        save_path).assert_consumed()
-    status.run_restore_ops()
-    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
-    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testRestoreOnAssign(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_graph = ops.Graph()
-    with save_graph.as_default(), self.test_session(save_graph):
-      first = checkpointable.Checkpointable()
-      first.var1 = variable_scope.get_variable(
-          name="outside_var", initializer=0.)
-      first.var2 = variable_scope.get_variable(
-          name="blah", initializer=0.)
-      self.evaluate(first.var1.assign(4.))
-      self.evaluate(first.var2.assign(8.))
-      save_path = checkpointable_utils.CheckpointableSaver(first).save(
-          checkpoint_prefix)
-    restore_graph = ops.Graph()
-    with restore_graph.as_default(), self.test_session(restore_graph):
-      second = checkpointable.Checkpointable()
-      second.var2 = variable_scope.get_variable(
-          name="blah", initializer=0.)
-      status = checkpointable_utils.CheckpointableSaver(
-          second).restore(save_path)
-      recreated_var1 = variable_scope.get_variable(
-          name="outside_var", initializer=0.)
-      status.run_restore_ops()
-      self.assertEqual(8., self.evaluate(second.var2))
-      self.evaluate(recreated_var1.assign(-2.))
-      self.assertEqual(-2., self.evaluate(recreated_var1))
-      second.var1 = recreated_var1
-      status.run_restore_ops()
-      self.assertEqual(4., self.evaluate(recreated_var1))
-
-  def testManySavesGraph(self):
-    """Saves after the first should not modify the graph."""
-    with context.graph_mode():
-      graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
-        checkpoint_directory = self.get_temp_dir()
-        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = checkpointable.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = adam.AdamOptimizer(0.1)
-        obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
-        before_ops = graph.get_operations()
-        saver.save(checkpoint_prefix)
-        self.assertEqual(before_ops, graph.get_operations())
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testCheckpointCleanup(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = checkpointable.Checkpointable()
-    obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    saver = checkpointable_utils.Checkpoint(obj=obj)
-    for _ in range(10):
-      saver.save(checkpoint_prefix)
-    expected_filenames = ["checkpoint"]
-    for checkpoint_number in range(6, 11):
-      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
-      expected_filenames.append(
-          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
-    six.assertCountEqual(
-        self,
-        expected_filenames,
-        os.listdir(checkpoint_directory))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def testCheckpointCleanupChangingVarList(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = checkpointable.Checkpointable()
-    obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
-    looped_variables = []
-    for iteration in range(10):
-      new_variable = resource_variable_ops.ResourceVariable(iteration)
-      self.evaluate(new_variable.initializer)
-      setattr(checkpoint, "var_%d" % iteration, new_variable)
-      checkpoint.save(checkpoint_prefix)
-      looped_variables.append(new_variable)
-    expected_filenames = ["checkpoint"]
-    # We've copied the saver each time, but checkpoint management should still
-    # be consistent.
-    for checkpoint_number in range(6, 11):
-      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
-      expected_filenames.append(
-          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
-    six.assertCountEqual(
-        self,
-        expected_filenames,
-        os.listdir(checkpoint_directory))
-    for v in looped_variables:
-      self.evaluate(v.assign(314))
-    checkpoint.restore(checkpoint_prefix + "-6").run_restore_ops()
-    self.assertEqual(314, self.evaluate(checkpoint.var_9))
-    self.assertEqual(314, self.evaluate(checkpoint.var_8))
-    self.assertEqual(314, self.evaluate(checkpoint.var_6))
-    self.assertEqual(5, self.evaluate(checkpoint.var_5))
-    self.assertEqual(1, self.evaluate(checkpoint.var_1))
-    self.assertEqual(0, self.evaluate(checkpoint.var_0))
-    if context.executing_eagerly():
-      checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
-      self.assertEqual(9, self.evaluate(checkpoint.var_9))
-      self.assertEqual(8, self.evaluate(checkpoint.var_8))
-      self.assertEqual(1, self.evaluate(checkpoint.var_1))
-      self.assertEqual(0, self.evaluate(checkpoint.var_0))
-    else:
-      # Restoring into modified graphs is an error while graph building.
-      with self.assertRaises(NotImplementedError):
-        checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
-
-  def testManyRestoresGraph(self):
-    """Restores after the first should not modify the graph."""
-    with context.graph_mode():
-      graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
-        checkpoint_directory = self.get_temp_dir()
-        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = checkpointable.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = adam.AdamOptimizer(0.1)
-        obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
-        before_ops = graph.get_operations()
-        saver.restore(save_path)
-        self.assertEqual(before_ops, graph.get_operations())
-
-  def testMultipleGraphsNonSlotVariables(self):
-    with context.graph_mode():
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      optimizer = adam.AdamOptimizer(0.001)
-      # Construct a model in one graph
-      first_graph = ops.Graph()
-      first_session = session_lib.Session(graph=first_graph)
-      with first_graph.as_default(), first_session.as_default():
-        first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, variable=first_variable)
-        train_op = optimizer.minimize(first_variable.read_value)
-        self.evaluate(checkpointable_utils.gather_initializers(
-            first_root_checkpointable))
-        self.evaluate(train_op)
-        self.evaluate(first_variable.assign([1.]))
-        self.evaluate(optimizer.get_slot(
-            var=first_variable, name="m").assign([2.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(3.))
-
-      # Save and load in a second graph
-      second_graph = ops.Graph()
-      with second_graph.as_default(), session_lib.Session(graph=second_graph):
-        second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, variable=second_variable)
-        train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
-        self.evaluate(train_op)
-        self.evaluate(second_variable.assign([4.]))
-        self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m").assign([5.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
-        self.evaluate(second_variable.assign([7.]))
-        self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m").assign([8.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
-        status = second_root_checkpointable.restore(save_path)
-        status.assert_consumed().run_restore_ops()
-        self.assertAllEqual([4.], self.evaluate(second_variable))
-        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
-
-      # Check that the first graph is unmolested
-      with first_graph.as_default(), first_session.as_default():
-        self.assertAllEqual([1.], self.evaluate(first_variable))
-        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
-            var=first_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(3., self.evaluate(beta1_power))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_sequential(self):
-    model = sequential.Sequential()
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
-    model.add(core.Dense(4))
-    second_dense = core.Dense(5)
-    model.add(second_dense)
-    model(constant_op.constant([[1.]]))
-    checkpoint.restore(None).initialize_or_restore()
-    self.evaluate(second_dense.bias.assign(
-        constant_op.constant([1., 2., 3., 4., 5.])))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpoint.save(checkpoint_prefix)
-    self.evaluate(second_dense.bias.assign(
-        constant_op.constant([5., 6., 7., 8., 9.])))
-    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
-    self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
-
-    deferred_sequential = sequential.Sequential()
-    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
-        model=deferred_sequential)
-    status = deferred_sequential_checkpoint.restore(save_path)
-    deferred_sequential.add(core.Dense(4))
-    deferred_sequential(constant_op.constant([[1.]]))
-    deferred_second_dense = core.Dense(5)
-    deferred_sequential.add(deferred_second_dense)
-    deferred_sequential(constant_op.constant([[1.]]))
-    status.run_restore_ops()
-    self.assertAllEqual([1., 2., 3., 4., 5.],
-                        self.evaluate(deferred_second_dense.bias))
-
-
-class TemplateTests(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def test_checkpointable_save_restore(self):
-
-    def _templated():
-      v = variable_scope.get_variable(
-          "v", shape=[1], initializer=init_ops.zeros_initializer())
-      v2 = variable_scope.get_variable(
-          "v2", shape=[1], initializer=init_ops.zeros_initializer())
-      return v, v + 1., v2
-
-    save_template = template.make_template("s1", _templated)
-    save_root = checkpointable_utils.Checkpoint(my_template=save_template)
-    v1_save, _, v2_save = save_template()
-    self.evaluate(v1_save.assign([12.]))
-    self.evaluate(v2_save.assign([14.]))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = save_root.save(checkpoint_prefix)
-
-    load_template = template.make_template("s2", _templated)
-    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
-    status = load_root.restore(save_path)
-    var, var_plus_one, var2 = load_template()
-    self.assertEqual(2, len(load_template._checkpoint_dependencies))
-    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
-    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
-    status.assert_consumed().run_restore_ops()
-    self.assertAllEqual([12.], self.evaluate(var))
-    self.assertAllEqual([13.], self.evaluate(var_plus_one))
-    self.assertAllEqual([14.], self.evaluate(var2))
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
-  def test_checkpointable_save_restore_nested(self):
-
-    def _inner_template():
-      v = variable_scope.get_variable(
-          "v", shape=[1], initializer=init_ops.zeros_initializer())
-      return v
-
-    def _outer_template():
-      first_inner = template.make_template("i1", _inner_template)
-      second_inner = template.make_template("i2", _inner_template)
-      v1 = first_inner()
-      v2 = second_inner()
-      v3 = second_inner()
-      return (first_inner, second_inner), (v1, v2, v3)
-
-    with variable_scope.variable_scope("ignored"):
-      save_template = template.make_template("s1", _outer_template)
-      save_root = checkpointable_utils.Checkpoint(my_template=save_template)
-      (inner_template_one, inner_template_two), _ = save_template()
-    self.evaluate(inner_template_one.variables[0].assign([20.]))
-    self.evaluate(inner_template_two.variables[0].assign([25.]))
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = save_root.save(checkpoint_prefix)
-
-    load_template = template.make_template("s2", _outer_template)
-    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
-    status = load_root.restore(save_path)
-    (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
-    outer_template_dependencies = load_root.my_template._checkpoint_dependencies
-    self.assertEqual(2, len(outer_template_dependencies))
-    self.assertEqual("i1", outer_template_dependencies[0].name)
-    self.assertIs(inner_template_one, outer_template_dependencies[0].ref)
-    self.assertEqual("i2", outer_template_dependencies[1].name)
-    self.assertIs(inner_template_two, outer_template_dependencies[1].ref)
-    self.assertEqual(1, len(inner_template_one._checkpoint_dependencies))
-    self.assertEqual("v", inner_template_one._checkpoint_dependencies[0].name)
-    self.assertEqual(1, len(inner_template_two._checkpoint_dependencies))
-    self.assertEqual("v", inner_template_two._checkpoint_dependencies[0].name)
-    status.assert_consumed().run_restore_ops()
-    self.assertAllEqual([20.], self.evaluate(v1))
-    self.assertAllEqual([25.], self.evaluate(v2))
-    self.assertAllEqual([25.], self.evaluate(v3))
-
-
-class CheckpointCompatibilityTests(test.TestCase):
-
-  def _initialized_model(self):
-    input_value = constant_op.constant([[3.]])
-    model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
-    train_op = optimizer.minimize(
-        functools.partial(model, input_value),
-        global_step=optimizer_step)
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
-    self.evaluate(train_op)
-    # A regular variable, a slot variable, and a non-slot Optimizer variable
-    # with known values to check when loading.
-    self.evaluate(model._named_dense.bias.assign([1.]))
-    self.evaluate(optimizer.get_slot(
-        var=model._named_dense.bias, name="m").assign([2.]))
-    beta1_power, _ = optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(3.))
-    return root_checkpointable
-
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
-    self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
-        .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(103.))
-
-  def _check_sentinels(self, root_checkpointable):
-    self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
-    self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.assertAllEqual(3., self.evaluate(beta1_power))
-
-  def _write_name_based_checkpoint(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    with context.graph_mode():
-      save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
-          graph=save_graph) as session:
-        root = self._initialized_model()
-        name_saver = core_saver.Saver()
-        return name_saver.save(
-            sess=session, save_path=checkpoint_prefix,
-            global_step=root.optimizer_step)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testLoadFromNameBasedSaver(self):
-    """Save a name-based checkpoint, load it using the object-based API."""
-    with test_util.device(use_gpu=True):
-      save_path = self._write_name_based_checkpoint()
-      root = self._initialized_model()
-      self._set_sentinels(root)
-      with self.assertRaises(AssertionError):
-        self._check_sentinels(root)
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
-      status = object_saver.restore(save_path)
-      with self.assertRaises(AssertionError):
-        status.assert_consumed()
-      status.run_restore_ops()
-      self._check_sentinels(root)
-      self._set_sentinels(root)
-      status.initialize_or_restore()
-      self._check_sentinels(root)
-
-  # TODO(allenl): Test for the core name-based saver loading object-based
-  # checkpoints once object-based checkpointing is in core.
-
-  def testSaveGraphLoadEager(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    with context.graph_mode():
-      save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
-          graph=save_graph) as session:
-        root = self._initialized_model()
-        object_saver = checkpointable_utils.CheckpointableSaver(root)
-        save_path = object_saver.save(
-            session=session, file_prefix=checkpoint_prefix)
-    with context.eager_mode():
-      root = self._initialized_model()
-      self._set_sentinels(root)
-      root.restore(save_path).assert_consumed()
-      self._check_sentinels(root)
-
-  def testSaveEagerLoadGraph(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    with context.eager_mode():
-      root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
-      save_path = object_saver.save(file_prefix=checkpoint_prefix)
-    with context.graph_mode():
-      save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
-          graph=save_graph):
-        root = self._initialized_model()
-        self._set_sentinels(root)
-        root.restore(save_path).assert_consumed().run_restore_ops()
-        self._check_sentinels(root)
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index f76a896d3d8d795b5a7a0e97b5f688fb0291575a..7b123707cc3a26073088cf2c57c6211e831c19fd 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -27,7 +27,6 @@ from tensorflow.contrib import lookup
 from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.contrib.data.python.ops import threadpool
 from tensorflow.contrib.data.python.ops import unique
-from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.data import Dataset
 from tensorflow.python.eager import test
@@ -38,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.training import checkpointable_utils
 
 
 class IteratorTest(test.TestCase):
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 37c8f0d47adbde6932bf409cdcae9a1845d700b5..7949a3f6da293abdd85512209242bae76ab4d816 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -22,12 +22,12 @@ import six
 
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 
 
 class Evaluator(object):
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index a90048d813bf345e8be32e9674a452175471b268..be5d60449d7e08c99cc28e76befce56f468c77fd 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -315,32 +315,37 @@ def main(_):
   have_gpu = tfe.num_gpus() > 0
   use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu
 
-  with tfe.restore_variables_on_create(
-      tf.train.latest_checkpoint(FLAGS.logdir)):
-    with tf.device("/device:GPU:0" if have_gpu else None):
-      # Make learning_rate a Variable so it can be included in the checkpoint
-      # and we can resume training with the last saved learning_rate.
-      learning_rate = tfe.Variable(20.0, name="learning_rate")
-      sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
-      model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
-                       FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
-                       use_cudnn_rnn)
-      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-
-      best_loss = None
-      for _ in range(FLAGS.epoch):
-        train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
-        eval_loss = evaluate(model, eval_data)
-        if not best_loss or eval_loss < best_loss:
-          if FLAGS.logdir:
-            tfe.Saver(model.trainable_weights + [learning_rate]).save(
-                os.path.join(FLAGS.logdir, "ckpt"))
-          best_loss = eval_loss
-        else:
-          learning_rate.assign(learning_rate / 4.0)
-          sys.stderr.write("eval_loss did not reduce in this epoch, "
-                           "changing learning rate to %f for the next epoch\n" %
-                           learning_rate.numpy())
+  with tf.device("/device:GPU:0" if have_gpu else None):
+    # Make learning_rate a Variable so it can be included in the checkpoint
+    # and we can resume training with the last saved learning_rate.
+    learning_rate = tfe.Variable(20.0, name="learning_rate")
+    model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
+                     FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
+                     use_cudnn_rnn)
+    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+    checkpoint = tfe.Checkpoint(
+        learning_rate=learning_rate, model=model,
+        # GradientDescentOptimizer has no state to checkpoint, but noting it
+        # here lets us swap in an optimizer that does.
+        optimizer=optimizer)
+    # Restore existing variables now (learning_rate), and restore new variables
+    # on creation if a checkpoint exists.
+    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.logdir))
+    sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
+
+    best_loss = None
+    for _ in range(FLAGS.epoch):
+      train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
+      eval_loss = evaluate(model, eval_data)
+      if not best_loss or eval_loss < best_loss:
+        if FLAGS.logdir:
+          checkpoint.save(os.path.join(FLAGS.logdir, "ckpt"))
+        best_loss = eval_loss
+      else:
+        learning_rate.assign(learning_rate / 4.0)
+        sys.stderr.write("eval_loss did not reduce in this epoch, "
+                         "changing learning rate to %f for the next epoch\n" %
+                         learning_rate.numpy())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 9adf47d505fc2933d9c009e5863351bd123c3797..f825a2a7363fbe144162eca96398920ead0c4e50 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -33,8 +33,8 @@ import tensorflow as tf
 import tensorflow.contrib.eager as tfe
 from tensorflow.contrib.eager.python.examples.spinn import data
 from third_party.examples.eager.spinn import spinn
-from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
 from tensorflow.contrib.summary import summary_test_util
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.training import checkpoint_utils
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 2f2347736a073c7d9b3fb6685f52f8d58cc40570..907f9204c2d31a652ca2a0539a23db4722b4e154 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import re
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -29,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpointable
 
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 15ac889191e0fe51269bc5740d5e0ab1bc0e2b72..f0fe4ce8c53bb80c03a3f0de37078bcdb975a0b4 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -21,9 +21,7 @@ from __future__ import print_function
 import os
 import tempfile
 
-from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
@@ -31,6 +29,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import training_util
 
 
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index e55a9276ab53f44f76dc5e537b3bdde7c975f463..2f8721324f5fc12565d047a64af22b8df215a92b 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -25,6 +25,7 @@ import weakref
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import ops
+from tensorflow.python.keras._impl.keras.engine import base_layer as keras_base_layer
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpoint_utils
@@ -176,7 +177,7 @@ class Network(base.Layer):
         avoid_names = parent_network._owned_layers
         name_uid_map = parent_network._sub_layer_name_uids
       else:
-        name_uid_map = base._get_default_graph_uid_map()
+        name_uid_map = keras_base_layer.get_default_graph_uid_map()
         # Figure out which names we have to avoid based on which variable scope
         # we're nested in.
         strip_name = self._default_parent_variable_scope.name
@@ -326,6 +327,8 @@ class Network(base.Layer):
       raise TypeError(
           "Network.track_layer() passed type %s, not a tf.layers.Layer" %
           (type(layer),))
+    # Always use `ResourceVariable` with legacy layers.
+    layer._use_resource_variables = True
     if isinstance(layer, Network):
       layer._finalize_name(parent_network=self)
     else:
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 3329fc6c513265deff41a368f5688dd605209c14..f43376d5d777a7f17d975e07b746f7b1c731e8ea 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -20,12 +20,10 @@ import gc
 
 from tensorflow.contrib.eager.python import network
 from tensorflow.contrib.layers.python.layers import regularizers
-from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import math_ops
@@ -469,36 +467,6 @@ class NetworkTest(test.TestCase):
     self.assertIsInstance(net.trainable_weights[0],
                           resource_variable_ops.ResourceVariable)
 
-  def testGraphOpNames(self):
-    """Network operation names should match variable naming."""
-
-    def _check_op_prefixes(expected_prefix, checked_ops):
-      for operation in ops.get_default_graph().get_operations():
-        if operation.name == "ignore":
-          continue
-        if operation.name in checked_ops:
-          continue
-        checked_ops.add(operation.name)
-        self.assertStartsWith(expected_start=expected_prefix,
-                              actual=operation.name)
-        self.assertNotIn("my_network", operation.name[len(expected_prefix):])
-        self.assertNotIn("dense", operation.name[len(expected_prefix):])
-
-    with context.graph_mode():
-      net = MyNetwork()
-      zero = constant_op.constant([[0.]], name="ignore")
-      net(zero)
-      checked_ops = set()
-      _check_op_prefixes(expected_prefix="my_network/dense/",
-                         checked_ops=checked_ops)
-      net.net2 = net.track_layer(MyNetwork())
-      net.net2(zero)
-      _check_op_prefixes(expected_prefix="my_network/my_network/dense/",
-                         checked_ops=checked_ops)
-      MyNetwork()(zero)
-      _check_op_prefixes(expected_prefix="my_network_1/dense/",
-                         checked_ops=checked_ops)
-
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testVariableRegularizers(self):
     net = RegularizedNetwork()
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index c6f3f20e781147140f2c4b339ed465ab7e919d37..79dd117854e5fe9f066f671d8ce62e08579e0ed9 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -84,8 +84,6 @@ from __future__ import print_function
 # pylint:disable=g-bad-import-order,g-import-not-at-top,unused-import
 #
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.eager.python.checkpointable_utils import CheckpointableSaver
-from tensorflow.contrib.eager.python.checkpointable_utils import Checkpoint
 from tensorflow.contrib.eager.python.datasets import Iterator
 from tensorflow.contrib.eager.python.network import Network
 from tensorflow.contrib.eager.python.network import Sequential
@@ -123,6 +121,8 @@ from tensorflow.python.ops.variable_scope import EagerVariableStore
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import template
 from tensorflow.python.training.checkpointable import Checkpointable
+from tensorflow.python.training.checkpointable_utils import CheckpointableSaver
+from tensorflow.python.training.checkpointable_utils import Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
 
 py_func = script_ops.eager_py_func
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index bec0329ebbd82b06fba6a8283500ad7f3a11b6a2..9f4cd44afbede286966ba0e7357c5dac92a2b729 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -23,6 +23,7 @@ py_library(
         ":logit_fns",
         ":multi_head",
         ":replicate_model_fn",
+        ":rnn",
         "//tensorflow/python:util",
     ],
 )
@@ -412,3 +413,57 @@ cuda_py_test(
         "notap",
     ],
 )
+
+py_library(
+    name = "rnn",
+    srcs = ["python/estimator/rnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":extenders",
+        "//tensorflow/contrib/feature_column:feature_column_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:head",
+        "//tensorflow/python/estimator:optimizers",
+        "//tensorflow/python/feature_column",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "rnn_test",
+    size = "medium",
+    srcs = ["python/estimator/rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":rnn",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index d2fc2c4bfa448227819c8d706387c1c75062b80b..be20d1b7770d3f3df21ac9c0f811d924bf4152ee 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -28,6 +28,7 @@ from tensorflow.contrib.estimator.python.estimator.linear import *
 from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
 from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
+from tensorflow.contrib.estimator.python.estimator.rnn import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
@@ -52,6 +53,7 @@ _allowed_symbols = [
     'linear_logit_fn_builder',
     'replicate_model_fn',
     'TowerOptimizer',
+    'RNNClassifier',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index e99a87f3b3c0e7c5840fa250506e600645bf6a29..eee59106876f6c44725bcbba1ef3d3c803475dbf 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import boosted_trees
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
@@ -69,10 +70,18 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
         for i in range(NUM_FEATURES)
     }
 
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
 
   def testTrainAndEvaluateEstimator(self):
     input_fn = _make_train_input_fn(is_classification=False)
@@ -88,9 +97,10 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 11)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.913176)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
 
   def testInferEstimator(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -108,31 +118,13 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
-
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
-
-
-class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
   def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=True)
@@ -145,36 +137,16 @@ class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase):
         n_trees=1,
         max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     # Check eval.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
-
-    # Check predict that all labels are correct.
+    # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0], predictions[0]['class_ids'])
-    self.assertAllClose([1], predictions[1]['class_ids'])
-    self.assertAllClose([1], predictions[2]['class_ids'])
-    self.assertAllClose([0], predictions[3]['class_ids'])
-    self.assertAllClose([0], predictions[4]['class_ids'])
-
-
-class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
   def testRegressorTrainInMemoryAndEvalAndInfer(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -187,20 +159,17 @@ class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase):
         n_trees=1,
         max_depth=5)
     # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     # Check eval.
     eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.2136638)
-
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
     # Validate predictions.
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index bbbc19cc4dfb4b23f9b707023fbfdd124f1f48de..ce758992140d43529037b14cbbf958d5aa763fb4 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -345,7 +345,7 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         if k == _DEFAULT_SERVING_KEY:
           key = head_name
         else:
-          key = '%s/%s' % (k, head_name)
+          key = '%s/%s' % (head_name, k)
         export_outputs[key] = v
         if (k == head_lib._PREDICT_SERVING_KEY and  # pylint:disable=protected-access
             isinstance(v, export_output_lib.PredictOutput)):
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index d9e5aca2952d25a7d917f9d76f95ab89733115a0..3d6fccb1180c435f64552667306be004437f62ba 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -127,8 +127,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'classification/head1',
-         'predict/head1', 'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
+         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -169,11 +169,11 @@ class MultiHeadTest(test.TestCase):
       self.assertAllClose(
           expected_probabilities['head1'],
           sess.run(
-              spec.export_outputs['predict/head1'].outputs['probabilities']))
+              spec.export_outputs['head1/predict'].outputs['probabilities']))
       self.assertAllClose(
           expected_probabilities['head2'],
           sess.run(
-              spec.export_outputs['predict/head2'].outputs['probabilities']))
+              spec.export_outputs['head2/predict'].outputs['probabilities']))
 
   def test_predict_two_heads_logits_tensor(self):
     """Tests predict with logits as Tensor."""
@@ -197,8 +197,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'classification/head1',
-         'predict/head1', 'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
+         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -254,8 +254,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'regression/head1',
-         'predict/head1', 'head2', 'regression/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/regression',
+         'head1/predict', 'head2', 'head2/regression', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index fa2697800ec1a44f215f3d5fc9be2197a9e58219..a8774d6dab9205439e6e312827f9cd1306e3f1ea 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -456,7 +456,7 @@ def _get_local_devices(device_type):
 
 
 def _split_batch(features, labels, number_of_shards, device):
-  """Split input features and labes into batches."""
+  """Split input features and labels into batches."""
 
   def ensure_divisible_by_shards(sequence):
     batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
@@ -602,7 +602,7 @@ def _local_device_setter(worker_device, ps_devices, ps_strategy):
 
 
 def _scale_tower_loss(tower_spec, loss_reduction, number_of_towers):
-  """Produce an EstimatorSpec with approproriately scaled loss."""
+  """Produce an EstimatorSpec with appropriately scaled loss."""
   if tower_spec.loss is None:
     return tower_spec
 
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn.py b/tensorflow/contrib/estimator/python/estimator/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b475c12f5af3aedc766a0880a98c5c1e29bddbb7
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/rnn.py
@@ -0,0 +1,481 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent Neural Network estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as core_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import training_util
+
+
+# The defaults are historical artifacts of the initial implementation, but seem
+# reasonable choices.
+_DEFAULT_LEARNING_RATE = 0.05
+_DEFAULT_CLIP_NORM = 5.0
+
+_CELL_TYPES = {'basic_rnn': rnn_cell.BasicRNNCell,
+               'lstm': rnn_cell.BasicLSTMCell,
+               'gru': rnn_cell.GRUCell}
+
+# Indicates no value was provided by the user to a kwarg.
+USE_DEFAULT = object()
+
+
+def _single_rnn_cell(num_units, cell_type):
+  cell_type = _CELL_TYPES.get(cell_type, cell_type)
+  if not cell_type or not issubclass(cell_type, rnn_cell.RNNCell):
+    raise ValueError('Supported cell types are {}; got {}'.format(
+        list(_CELL_TYPES.keys()), cell_type))
+  return cell_type(num_units=num_units)
+
+
+def _make_rnn_cell_fn(num_units, cell_type='basic_rnn'):
+  """Convenience function to create `rnn_cell_fn` for canned RNN Estimators.
+
+  Args:
+    num_units: Iterable of integer number of hidden units per RNN layer.
+    cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
+      the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
+      `'gru'`.
+
+  Returns:
+    A function that takes a single argument, an instance of
+    `tf.estimator.ModeKeys`, and returns an instance derived from
+    `tf.nn.rnn_cell.RNNCell`.
+
+  Raises:
+    ValueError: If cell_type is not supported.
+  """
+  def rnn_cell_fn(mode):
+    # Unused. Part of the rnn_cell_fn interface since user specified functions
+    # may need different behavior across modes (e.g. dropout).
+    del mode
+    cells = [_single_rnn_cell(n, cell_type) for n in num_units]
+    if len(cells) == 1:
+      return cells[0]
+    return rnn_cell.MultiRNNCell(cells)
+  return rnn_cell_fn
+
+
+def _concatenate_context_input(sequence_input, context_input):
+  """Replicates `context_input` across all timesteps of `sequence_input`.
+
+  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
+  This value is appended to `sequence_input` on dimension 2 and the result is
+  returned.
+
+  Args:
+    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
+      padded_length, d0]`.
+    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
+
+  Returns:
+    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
+    d0 + d1]`.
+
+  Raises:
+    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
+      not have rank 2.
+  """
+  seq_rank_check = check_ops.assert_rank(
+      sequence_input,
+      3,
+      message='sequence_input must have rank 3',
+      data=[array_ops.shape(sequence_input)])
+  seq_type_check = check_ops.assert_type(
+      sequence_input,
+      dtypes.float32,
+      message='sequence_input must have dtype float32; got {}.'.format(
+          sequence_input.dtype))
+  ctx_rank_check = check_ops.assert_rank(
+      context_input,
+      2,
+      message='context_input must have rank 2',
+      data=[array_ops.shape(context_input)])
+  ctx_type_check = check_ops.assert_type(
+      context_input,
+      dtypes.float32,
+      message='context_input must have dtype float32; got {}.'.format(
+          context_input.dtype))
+  with ops.control_dependencies(
+      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
+    padded_length = array_ops.shape(sequence_input)[1]
+    tiled_context_input = array_ops.tile(
+        array_ops.expand_dims(context_input, 1),
+        array_ops.concat([[1], [padded_length], [1]], 0))
+  return array_ops.concat([sequence_input, tiled_context_input], 2)
+
+
+def _select_last_activations(activations, sequence_lengths):
+  """Selects the nth set of activations for each n in `sequence_length`.
+
+  Returns a `Tensor` of shape `[batch_size, k]`. If `sequence_length` is not
+  `None`, then `output[i, :] = activations[i, sequence_length[i] - 1, :]`. If
+  `sequence_length` is `None`, then `output[i, :] = activations[i, -1, :]`.
+
+  Args:
+    activations: A `Tensor` with shape `[batch_size, padded_length, k]`.
+    sequence_lengths: A `Tensor` with shape `[batch_size]` or `None`.
+  Returns:
+    A `Tensor` of shape `[batch_size, k]`.
+  """
+  with ops.name_scope(
+      'select_last_activations', values=[activations, sequence_lengths]):
+    activations_shape = array_ops.shape(activations)
+    batch_size = activations_shape[0]
+    padded_length = activations_shape[1]
+    output_units = activations_shape[2]
+    if sequence_lengths is None:
+      sequence_lengths = padded_length
+    start_indices = math_ops.to_int64(
+        math_ops.range(batch_size) * padded_length)
+    last_indices = start_indices + sequence_lengths - 1
+    reshaped_activations = array_ops.reshape(
+        activations, [batch_size * padded_length, output_units])
+
+    last_activations = array_ops.gather(reshaped_activations, last_indices)
+    last_activations.set_shape([activations.shape[0], activations.shape[2]])
+    return last_activations
+
+
+def _rnn_logit_fn_builder(output_units, rnn_cell_fn, sequence_feature_columns,
+                          context_feature_columns, input_layer_partitioner):
+  """Function builder for a rnn logit_fn.
+
+  Args:
+    output_units: An int indicating the dimension of the logit layer.
+    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+      returns an object of type `tf.nn.rnn_cell.RNNCell`.
+    sequence_feature_columns: An iterable containing the `FeatureColumn`s
+      that represent sequential input.
+    context_feature_columns: An iterable containing the `FeatureColumn`s
+      that represent contextual input.
+    input_layer_partitioner: Partitioner for input layer.
+
+  Returns:
+    A logit_fn (see below).
+
+  Raises:
+    ValueError: If output_units is not an int.
+  """
+  if not isinstance(output_units, int):
+    raise ValueError('output_units must be an int.  Given type: {}'.format(
+        type(output_units)))
+
+  def rnn_logit_fn(features, mode):
+    """Recurrent Neural Network logit_fn.
+
+    Args:
+      features: This is the first item returned from the `input_fn`
+                passed to `train`, `evaluate`, and `predict`. This should be a
+                single `Tensor` or `dict` of same.
+      mode: Optional. Specifies if this training, evaluation or prediction. See
+            `ModeKeys`.
+
+    Returns:
+      A `Tensor` representing the logits.
+    """
+    with variable_scope.variable_scope(
+        'sequence_input_layer',
+        values=tuple(six.itervalues(features)),
+        partitioner=input_layer_partitioner):
+      sequence_input, sequence_length = seq_fc.sequence_input_layer(
+          features=features, feature_columns=sequence_feature_columns)
+      summary.histogram('sequence_length', sequence_length)
+
+      if context_feature_columns:
+        context_input = feature_column_lib.input_layer(
+            features=features,
+            feature_columns=context_feature_columns)
+        sequence_input = _concatenate_context_input(sequence_input,
+                                                    context_input)
+
+    cell = rnn_cell_fn(mode)
+    # Ignore output state.
+    rnn_outputs, _ = rnn.dynamic_rnn(
+        cell=cell,
+        inputs=sequence_input,
+        dtype=dtypes.float32,
+        time_major=False)
+    last_activations = _select_last_activations(rnn_outputs, sequence_length)
+
+    with variable_scope.variable_scope('logits', values=(rnn_outputs,)):
+      logits = core_layers.dense(
+          last_activations,
+          units=output_units,
+          activation=None,
+          kernel_initializer=init_ops.glorot_uniform_initializer())
+    return logits
+
+  return rnn_logit_fn
+
+
+def _rnn_model_fn(features,
+                  labels,
+                  mode,
+                  head,
+                  rnn_cell_fn,
+                  sequence_feature_columns,
+                  context_feature_columns,
+                  optimizer='Adagrad',
+                  input_layer_partitioner=None,
+                  config=None):
+  """Recurrent Neural Net model_fn.
+
+  Args:
+    features: dict of `Tensor` and `SparseTensor` objects returned from
+      `input_fn`.
+    labels: `Tensor` of shape [batch_size, 1] or [batch_size] with labels.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `head_lib._Head` instance.
+    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+      returns an object of type `tf.nn.rnn_cell.RNNCell`.
+    sequence_feature_columns: Iterable containing `FeatureColumn`s that
+      represent sequential model inputs.
+    context_feature_columns: Iterable containing `FeatureColumn`s that
+      represent model inputs not associated with a specific timestep.
+    optimizer: String, `tf.Optimizer` object, or callable that creates the
+      optimizer to use for training. If not specified, will use the Adagrad
+      optimizer with a default learning rate of 0.05 and gradient clip norm of
+      5.0.
+    input_layer_partitioner: Partitioner for input layer. Defaults
+      to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+    config: `RunConfig` object to configure the runtime settings.
+
+  Returns:
+    An `EstimatorSpec` instance.
+
+  Raises:
+    ValueError: If mode or optimizer is invalid, or features has the wrong type.
+  """
+  if not isinstance(features, dict):
+    raise ValueError('features should be a dictionary of `Tensor`s. '
+                     'Given type: {}'.format(type(features)))
+
+  # If user does not provide an optimizer instance, use the optimizer specified
+  # by the string with default learning rate and gradient clipping.
+  if not isinstance(optimizer, optimizer_lib.Optimizer):
+    optimizer = optimizers.get_optimizer_instance(
+        optimizer, learning_rate=_DEFAULT_LEARNING_RATE)
+    optimizer = extenders.clip_gradients_by_norm(optimizer, _DEFAULT_CLIP_NORM)
+
+  num_ps_replicas = config.num_ps_replicas if config else 0
+  partitioner = partitioned_variables.min_max_variable_partitioner(
+      max_partitions=num_ps_replicas)
+  with variable_scope.variable_scope(
+      'rnn',
+      values=tuple(six.itervalues(features)),
+      partitioner=partitioner):
+    input_layer_partitioner = input_layer_partitioner or (
+        partitioned_variables.min_max_variable_partitioner(
+            max_partitions=num_ps_replicas,
+            min_slice_size=64 << 20))
+
+    logit_fn = _rnn_logit_fn_builder(
+        output_units=head.logits_dimension,
+        rnn_cell_fn=rnn_cell_fn,
+        sequence_feature_columns=sequence_feature_columns,
+        context_feature_columns=context_feature_columns,
+        input_layer_partitioner=input_layer_partitioner)
+    logits = logit_fn(features=features, mode=mode)
+
+    def _train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      return optimizer.minimize(
+          loss,
+          global_step=training_util.get_global_step())
+
+    return head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+
+
+class RNNClassifier(estimator.Estimator):
+  """A classifier for TensorFlow RNN models.
+
+  Trains a recurrent neural network model to classify instances into one of
+  multiple classes.
+
+  Example:
+
+  ```python
+  token_sequence = sequence_categorical_column_with_hash_bucket(...)
+  token_emb = embedding_column(categorical_column=token_sequence, ...)
+
+  estimator = RNNClassifier(
+      num_units=[32, 16], cell_type='lstm',
+      sequence_feature_columns=[token_emb])
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    pass
+  estimator.train(input_fn=input_fn_train, steps=100)
+
+  def input_fn_eval: # returns x, y
+    pass
+  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
+  def input_fn_predict: # returns x, None
+    pass
+  predictions = estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+  otherwise there will be a `KeyError`:
+
+  * if `weight_column` is not `None`, a feature with
+    `key=weight_column` whose value is a `Tensor`.
+  * for each `column` in `sequence_feature_columns`:
+    - a feature with `key=column.name` whose `value` is a `SparseTensor`.
+  * for each `column` in `context_feature_columns`:
+    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
+      with `key` the id column name, the second with `key` the weight column
+      name. Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+
+  Loss is calculated by using softmax cross entropy.
+
+  @compatibility(eager)
+  Estimators are not compatible with eager execution.
+  @end_compatibility
+  """
+
+  def __init__(self,
+               sequence_feature_columns,
+               context_feature_columns=None,
+               num_units=None,
+               cell_type=USE_DEFAULT,
+               rnn_cell_fn=None,
+               model_dir=None,
+               n_classes=2,
+               weight_column=None,
+               label_vocabulary=None,
+               optimizer='Adagrad',
+               input_layer_partitioner=None,
+               config=None):
+    """Initializes a `RNNClassifier` instance.
+
+    Args:
+      sequence_feature_columns: An iterable containing the `FeatureColumn`s
+        that represent sequential input. All items in the set should either be
+        sequence columns (e.g. `sequence_numeric_column`) or constructed from
+        one (e.g. `embedding_column` with `sequence_categorical_column_*` as
+        input).
+      context_feature_columns: An iterable containing the `FeatureColumn`s
+        for contextual input. The data represented by these columns will be
+        replicated and given to the RNN at each timestep. These columns must be
+        instances of classes derived from `_DenseColumn` such as
+        `numeric_column`, not the sequential variants.
+      num_units: Iterable of integer number of hidden units per RNN layer. If
+        set, `cell_type` must also be specified and `rnn_cell_fn` must be
+        `None`.
+      cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
+        the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
+        `'gru'`. If set, `num_units` must also be specified and `rnn_cell_fn`
+        must be `None`.
+      rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+        returns an object of type `tf.nn.rnn_cell.RNNCell` that will be used to
+        construct the RNN. If set, `num_units` and `cell_type` cannot be set.
+        This is for advanced users who need additional customization beyond
+        `num_units` and `cell_type`. Note that `tf.nn.rnn_cell.MultiRNNCell` is
+        needed for stacked RNNs.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      n_classes: Number of label classes. Defaults to 2, namely binary
+        classification. Must be > 1.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
+      optimizer: An instance of `tf.Optimizer` used to train the model. Defaults
+        to Adagrad optimizer.
+      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
+        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: If `num_units`, `cell_type`, and `rnn_cell_fn` are not
+        compatible.
+    """
+    if rnn_cell_fn and (num_units or cell_type != USE_DEFAULT):
+      raise ValueError(
+          'num_units and cell_type must not be specified when using rnn_cell_fn'
+      )
+    if not rnn_cell_fn:
+      if cell_type == USE_DEFAULT:
+        cell_type = 'basic_rnn'
+      rnn_cell_fn = _make_rnn_cell_fn(num_units, cell_type)
+
+    if n_classes == 2:
+      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
+          weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    else:
+      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
+          n_classes, weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    def _model_fn(features, labels, mode, config):
+      return _rnn_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          rnn_cell_fn=rnn_cell_fn,
+          sequence_feature_columns=tuple(sequence_feature_columns or []),
+          context_feature_columns=tuple(context_feature_columns or []),
+          optimizer=optimizer,
+          input_layer_partitioner=input_layer_partitioner,
+          config=config)
+    super(RNNClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn_test.py b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..393f94f5c7de02c56d93993bbeb8aaec4ea8234c
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
@@ -0,0 +1,1131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rnn.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import rnn
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import input as input_lib
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_util
+
+
+# Names of variables created by BasicRNNCell model.
+TOKEN_EMBEDDING_NAME = 'rnn/sequence_input_layer/input_layer/tokens_sequential_embedding/embedding_weights'
+CELL_WEIGHTS_NAME = 'rnn/rnn/basic_rnn_cell/kernel'
+CELL_BIAS_NAME = 'rnn/rnn/basic_rnn_cell/bias'
+MULTI_CELL_WEIGHTS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/kernel'
+MULTI_CELL_BIAS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/bias'
+LOGITS_WEIGHTS_NAME = 'rnn/logits/dense/kernel'
+LOGITS_BIAS_NAME = 'rnn/logits/dense/bias'
+
+
+def _assert_close(expected, actual, rtol=1e-04, name='assert_close'):
+  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
+    expected = ops.convert_to_tensor(expected, name='expected')
+    actual = ops.convert_to_tensor(actual, name='actual')
+    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
+    rtol = ops.convert_to_tensor(rtol, name='rtol')
+    return check_ops.assert_less(
+        rdiff,
+        rtol,
+        data=('Condition expected =~ actual did not hold element-wise:'
+              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
+              'rtol = ', rtol,),
+        name=scope)
+
+
+def create_checkpoint(rnn_weights, rnn_biases, logits_weights, logits_biases,
+                      global_step, model_dir):
+  """Create checkpoint file with provided model weights.
+
+  Args:
+    rnn_weights: Iterable of values of weights for the RNN cell.
+    rnn_biases: Iterable of values of biases for the RNN cell.
+    logits_weights: Iterable of values for matrix connecting RNN output to
+      logits.
+    logits_biases: Iterable of values for logits bias term.
+    global_step: Initial global step to save in checkpoint.
+    model_dir: Directory into which checkpoint is saved.
+  """
+  model_weights = {}
+  model_weights[CELL_WEIGHTS_NAME] = rnn_weights
+  model_weights[CELL_BIAS_NAME] = rnn_biases
+  model_weights[LOGITS_WEIGHTS_NAME] = logits_weights
+  model_weights[LOGITS_BIAS_NAME] = logits_biases
+
+  with ops.Graph().as_default():
+    # Create model variables.
+    for k, v in six.iteritems(model_weights):
+      variables_lib.Variable(v, name=k, dtype=dtypes.float32)
+
+    # Create non-model variables.
+    global_step_var = training_util.create_global_step()
+    assign_op = global_step_var.assign(global_step)
+
+    # Initialize vars and save checkpoint.
+    with monitored_session.MonitoredTrainingSession(
+        checkpoint_dir=model_dir) as sess:
+      sess.run(assign_op)
+
+
+class RNNLogitFnTest(test.TestCase):
+  """Tests correctness of logits calculated from _rnn_logit_fn_builder."""
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_logits(self, mode, rnn_units, logits_dimension, features_fn,
+                   sequence_feature_columns, context_feature_columns,
+                   expected_logits):
+    """Tests that the expected logits are calculated."""
+    with ops.Graph().as_default():
+      # Global step needed for MonitoredSession, which is in turn used to
+      # explicitly set variable weights through a checkpoint.
+      training_util.create_global_step()
+      # Use a variable scope here with 'rnn', emulating the rnn model_fn, so
+      # the checkpoint naming is shared.
+      with variable_scope.variable_scope('rnn'):
+        input_layer_partitioner = (
+            partitioned_variables.min_max_variable_partitioner(
+                max_partitions=0, min_slice_size=64 << 20))
+        logit_fn = rnn._rnn_logit_fn_builder(
+            output_units=logits_dimension,
+            rnn_cell_fn=rnn._make_rnn_cell_fn(rnn_units),
+            sequence_feature_columns=sequence_feature_columns,
+            context_feature_columns=context_feature_columns,
+            input_layer_partitioner=input_layer_partitioner)
+        # Features are constructed within this function, otherwise the Tensors
+        # containing the features would be defined outside this graph.
+        logits = logit_fn(features=features_fn(), mode=mode)
+        with monitored_session.MonitoredTrainingSession(
+            checkpoint_dir=self._model_dir) as sess:
+          self.assertAllClose(expected_logits, sess.run(logits), atol=1e-4)
+
+  def testOneDimLogits(self):
+    """Tests one-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10]], [[5]]]
+    initial_state = [0, 0]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
+                          = [[0.53, -0.37]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3]] = [[-0.6033]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033]])
+
+  def testMultiDimLogits(self):
+    """Tests multi-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10]], [[5]]]
+    initial_state = [0, 0]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
+                          = [[0.53, -0.37]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3],
+              [0.5*0.53 + 0.3*0.37 + 0.4],
+              [0.2*0.53 - 0.1*0.37 + 0.5]
+           = [[-0.6033, 0.7777, 0.5698]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=3,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033, 0.7777, 0.5698]])
+
+  def testMultiExampleMultiDim(self):
+    """Tests multiple examples and multi-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10], [5]], [[2], [7]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91], [0.38, 0.10]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
+                             [tanh(.1*7 + .2*.38 + .3*.10 +.2),
+                              tanh(-.2*7 - .3*.38 - .4*.10 +.5)]]
+                          = [[0.53, -0.37], [0.76, -0.78]
+    logits = [[-1*0.53 - 1*0.37 + 0.3,
+               0.5*0.53 + 0.3*0.37 + 0.4,
+               0.2*0.53 - 0.1*0.37 + 0.5],
+              [-1*0.76 - 1*0.78 + 0.3,
+               0.5*0.76 +0.3*0.78 + 0.4,
+               0.2*0.76 -0.1*0.78 + 0.5]]
+           = [[-0.6033, 0.7777, 0.5698], [-1.2473, 1.0170, 0.5745]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))
+    ]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=3,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033, 0.7777, 0.5698],
+                           [-1.2473, 1.0170, 0.5745]])
+
+  def testMultiExamplesDifferentLength(self):
+    """Tests multiple examples with different lengths.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10], [5]], [[2], [0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91], [0.38, 0.10]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.53, -0.37], [<ignored-padding>]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3],
+              [-1*0.38 + 1*0.10 + 0.3]]
+           = [[-0.6033], [0.0197]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033], [0.0197]])
+
+  def testMultiExamplesWithContext(self):
+    """Tests multiple examples with context features.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10, -0.5], [5, -0.5]], [[2, 0.8], [0, 0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 - 1*.5 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - 0.9*.5 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + 1*.8 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 + .9*.8 - .3*0 - .4*0 +.5)]]
+                          = [[0.60, -0.96], [0.83, 0.68]]
+    rnn_output_timestep_2 = [[tanh(.1*5 - 1*.5 + .2*.60 - .3*.96 +.2),
+                              tanh(-.2*5 - .9*.5 - .3*.60 + .4*.96 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.03, -0.63], [<ignored-padding>]]
+    logits = [[-1*0.03 - 1*0.63 + 0.3],
+              [-1*0.83 + 1*0.68 + 0.3]]
+           = [[-0.3662], [0.1414]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        # Context features weights are inserted between input and state weights.
+        rnn_weights=[[.1, -.2], [1., 0.9], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+          'context': [[-0.5], [0.8]],
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = [fc.numeric_column('context', shape=(1,))]
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.3662], [0.1414]])
+
+  def testMultiExamplesMultiFeatures(self):
+    """Tests examples with multiple sequential feature columns.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[1, 0, 10], [0, 1, 5]], [[1, 0, 2], [0, 0, 0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.5*1 + 1*0 + .1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.5*1 - 1*0 - .2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.5*1 + 1*0 + .1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.5*1 - 1*0 - .2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.94, -0.96], [0.72, -0.38]]
+    rnn_output_timestep_2 = [[tanh(.5*0 + 1*1 + .1*5 + .2*.94 - .3*.96 +.2),
+                              tanh(-.5*0 - 1*1 - .2*5 - .3*.94 + .4*.96 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.92, -0.88], [<ignored-padding>]]
+    logits = [[-1*0.92 - 1*0.88 + 0.3],
+              [-1*0.72 - 1*0.38 + 0.3]]
+           = [[-1.5056], [-0.7962]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        # FeatureColumns are sorted alphabetically, so on_sale weights are
+        # inserted before price.
+        rnn_weights=[[.5, -.5], [1., -1.], [.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+          'on_sale':
+              sparse_tensor.SparseTensor(
+                  values=[0, 1, 0],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }
+
+    price_column = seq_fc.sequence_numeric_column('price', shape=(1,))
+    on_sale_column = fc.indicator_column(
+        seq_fc.sequence_categorical_column_with_identity(
+            'on_sale', num_buckets=2))
+    sequence_feature_columns = [price_column, on_sale_column]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-1.5056], [-0.7962]])
+
+
+class RNNClassifierTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _assert_checkpoint(
+      self, n_classes, input_units, cell_units, expected_global_step):
+
+    shapes = {
+        name: shape for (name, shape) in
+        checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(
+        expected_global_step,
+        checkpoint_utils.load_variable(
+            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
+
+    # RNN Cell variables.
+    if len(cell_units) > 1:
+      for i, cell_unit in enumerate(cell_units):
+        self.assertEqual([input_units + cell_unit, cell_unit],
+                         shapes[MULTI_CELL_WEIGHTS_NAME_PATTERN % i])
+        self.assertEqual([cell_unit],
+                         shapes[MULTI_CELL_BIAS_NAME_PATTERN % i])
+        input_units = cell_unit
+    elif len(cell_units) == 1:
+      self.assertEqual([input_units + cell_unit, cell_unit],
+                       shapes[CELL_WEIGHTS_NAME])
+      self.assertEqual([cell_unit], shapes[CELL_BIAS_NAME])
+
+    # Logits variables.
+    logits_dimension = n_classes if n_classes > 2 else 1
+    self.assertEqual([cell_units[-1], logits_dimension],
+                     shapes[LOGITS_WEIGHTS_NAME])
+    self.assertEqual([logits_dimension], shapes[LOGITS_BIAS_NAME])
+
+  def _mock_optimizer(self, expected_loss=None):
+    expected_var_names = [
+        '%s/part_0:0' % CELL_BIAS_NAME,
+        '%s/part_0:0' % CELL_WEIGHTS_NAME,
+        '%s/part_0:0' % LOGITS_BIAS_NAME,
+        '%s/part_0:0' % LOGITS_WEIGHTS_NAME,
+    ]
+
+    def _minimize(loss, global_step):
+      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(
+          expected_var_names,
+          [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        return state_ops.assign_add(global_step, 1).op
+      assert_loss = _assert_close(
+          math_ops.to_float(expected_loss, name='expected'),
+          loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        return state_ops.assign_add(global_step, 1).op
+
+    mock_optimizer = test.mock.NonCallableMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
+    # So, return mock_optimizer itself for deepcopy.
+    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
+    return mock_optimizer
+
+  def testConflictingRNNCellFn(self):
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    cell_units = [4, 2]
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'num_units and cell_type must not be specified when using rnn_cell_fn'):
+      rnn.RNNClassifier(
+          sequence_feature_columns=[embed],
+          rnn_cell_fn=lambda x: x,
+          num_units=cell_units)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'num_units and cell_type must not be specified when using rnn_cell_fn'):
+      rnn.RNNClassifier(
+          sequence_feature_columns=[embed],
+          rnn_cell_fn=lambda x: x,
+          cell_type='lstm')
+
+  def _testFromScratchWithDefaultOptimizer(self, n_classes):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat'],
+                  indices=[[0, 0], [0, 1], [0, 2]],
+                  dense_shape=[1, 3]),
+      }, [[1]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        sequence_feature_columns=[embed],
+        num_units=cell_units,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def testBinaryClassFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=2)
+
+  def testMultiClassFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=4)
+
+  def testFromScratchWithCustomRNNCellFn(self):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat'],
+                  indices=[[0, 0], [0, 1], [0, 2]],
+                  dense_shape=[1, 3]),
+      }, [[1]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+    cell_units = [4, 2]
+    n_classes = 2
+
+    def rnn_cell_fn(mode):
+      del mode  # unused
+      cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
+      return rnn_cell.MultiRNNCell(cells)
+
+    est = rnn.RNNClassifier(
+        sequence_feature_columns=[embed],
+        rnn_cell_fn=rnn_cell_fn,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def _testExampleWeight(self, n_classes):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat', 'dog', 'barked'],
+                  indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
+                  dense_shape=[2, 3]),
+          'w': [[1], [2]],
+      }, [[1], [0]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        num_units=cell_units,
+        sequence_feature_columns=[embed],
+        n_classes=n_classes,
+        weight_column='w',
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def testBinaryClassWithExampleWeight(self):
+    self._testExampleWeight(n_classes=2)
+
+  def testMultiClassWithExampleWeight(self):
+    self._testExampleWeight(n_classes=4)
+
+  def testBinaryClassFromCheckpoint(self):
+    initial_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=initial_global_step,
+        model_dir=self._model_dir)
+
+    def train_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    # Uses same checkpoint and examples as testBinaryClassEvaluationMetrics.
+    # See that test for loss calculation.
+    mock_optimizer = self._mock_optimizer(expected_loss=1.119661)
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+    est.train(input_fn=train_input_fn, steps=10)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+  def testMultiClassFromCheckpoint(self):
+    initial_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=initial_global_step,
+        model_dir=self._model_dir)
+
+    def train_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    # Uses same checkpoint and examples as testMultiClassEvaluationMetrics.
+    # See that test for loss calculation.
+    mock_optimizer = self._mock_optimizer(expected_loss=2.662932)
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+    est.train(input_fn=train_input_fn, steps=10)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+
+def sorted_key_dict(unsorted_dict):
+  return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
+
+
+class RNNClassifierEvaluationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def testBinaryClassEvaluationMetrics(self):
+    global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=global_step,
+        model_dir=self._model_dir)
+
+    def eval_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(eval_input_fn, steps=1)
+
+    # Uses identical numbers to testMultiExamplesWithDifferentLength.
+    # See that test for logits calculation.
+    # logits = [[-0.603282], [0.019719]]
+    # probability = exp(logits) / (1 + exp(logits)) = [[0.353593], [0.504930]]
+    # loss = -label * ln(p) - (1 - label) * ln(1 - p)
+    #      = [[0.436326], [0.683335]]
+    expected_metrics = {
+        ops.GraphKeys.GLOBAL_STEP: global_step,
+        metric_keys.MetricKeys.LOSS: 1.119661,
+        metric_keys.MetricKeys.LOSS_MEAN: 0.559831,
+        metric_keys.MetricKeys.ACCURACY: 1.0,
+        metric_keys.MetricKeys.PREDICTION_MEAN: 0.429262,
+        metric_keys.MetricKeys.LABEL_MEAN: 0.5,
+        metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
+        # With default threshold of 0.5, the model is a perfect classifier.
+        metric_keys.MetricKeys.RECALL: 1.0,
+        metric_keys.MetricKeys.PRECISION: 1.0,
+        # Positive example is scored above negative, so AUC = 1.0.
+        metric_keys.MetricKeys.AUC: 1.0,
+        metric_keys.MetricKeys.AUC_PR: 1.0,
+    }
+    self.assertAllClose(
+        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
+
+  def testMultiClassEvaluationMetrics(self):
+    global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=global_step,
+        model_dir=self._model_dir)
+
+    def eval_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(eval_input_fn, steps=1)
+
+    # Uses identical numbers to testMultiExampleMultiDim.
+    # See that test for logits calculation.
+    # logits = [[-0.603282, 0.777708, 0.569756],
+    #           [-1.247356, 1.017018, 0.574481]]
+    # logits_exp = exp(logits) / (1 + exp(logits))
+    #            = [[0.547013, 2.176468, 1.767836],
+    #               [0.287263, 2.764937, 1.776208]]
+    # softmax_probabilities = logits_exp / logits_exp.sum()
+    #                       = [[0.121793, 0.484596, 0.393611],
+    #                          [0.059494, 0.572639, 0.367866]]
+    # loss = -1. * log(softmax[label])
+    #      = [[2.105432], [0.557500]]
+    expected_metrics = {
+        ops.GraphKeys.GLOBAL_STEP: global_step,
+        metric_keys.MetricKeys.LOSS: 2.662932,
+        metric_keys.MetricKeys.LOSS_MEAN: 1.331466,
+        metric_keys.MetricKeys.ACCURACY: 0.5,
+    }
+
+    self.assertAllClose(
+        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
+
+
+class RNNClassifierPredictionTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def testBinaryClassPredictions(self):
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=0,
+        model_dir=self._model_dir)
+
+    def predict_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    label_vocabulary = ['class_0', 'class_1']
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        label_vocabulary=label_vocabulary,
+        model_dir=self._model_dir)
+    # Uses identical numbers to testOneDimLogits.
+    # See that test for logits calculation.
+    # logits = [-0.603282]
+    # logistic = exp(-0.6033) / (1 + exp(-0.6033)) = [0.353593]
+    # probabilities = [0.646407, 0.353593]
+    # class_ids = argmax(probabilities) = [0]
+    predictions = next(est.predict(predict_input_fn))
+    self.assertAllClose([-0.603282],
+                        predictions[prediction_keys.PredictionKeys.LOGITS])
+    self.assertAllClose([0.353593],
+                        predictions[prediction_keys.PredictionKeys.LOGISTIC])
+    self.assertAllClose(
+        [0.646407, 0.353593],
+        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
+    self.assertAllClose([0],
+                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
+    self.assertEqual([b'class_0'],
+                     predictions[prediction_keys.PredictionKeys.CLASSES])
+
+  def testMultiClassPredictions(self):
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=0,
+        model_dir=self._model_dir)
+
+    def predict_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    label_vocabulary = ['class_0', 'class_1', 'class_2']
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        label_vocabulary=label_vocabulary,
+        model_dir=self._model_dir)
+    # Uses identical numbers to testMultiDimLogits.
+    # See that test for logits calculation.
+    # logits = [-0.603282, 0.777708, 0.569756]
+    # logits_exp = exp(logits) = [0.547013, 2.176468, 1.767836]
+    # softmax_probabilities = logits_exp / logits_exp.sum()
+    #                       = [0.121793, 0.484596, 0.393611]
+    # class_ids = argmax(probabilities) = [1]
+    predictions = next(est.predict(predict_input_fn))
+    self.assertAllClose([-0.603282, 0.777708, 0.569756],
+                        predictions[prediction_keys.PredictionKeys.LOGITS])
+    self.assertAllClose(
+        [0.121793, 0.484596, 0.393611],
+        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
+    self.assertAllClose([1],
+                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
+    self.assertEqual([b'class_1'],
+                     predictions[prediction_keys.PredictionKeys.CLASSES])
+
+
+class RNNClassifierIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, n_classes,
+      batch_size):
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    feature_columns = [embed]
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        num_units=cell_units,
+        sequence_feature_columns=feature_columns,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    num_steps = 10
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUATE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predicted_proba = np.array([
+        x[prediction_keys.PredictionKeys.PROBABILITIES]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
+
+    # EXPORT
+    feature_spec = {
+        'tokens': parsing_ops.VarLenFeature(dtypes.string),
+        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def testNumpyInputFn(self):
+    """Tests complete flow with numpy_input_fn."""
+    n_classes = 3
+    batch_size = 10
+    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
+    # Numpy only supports dense input, so all examples will have same length.
+    # TODO(b/73160931): Update test when support for prepadded data exists.
+    sequence_length = 3
+
+    features = []
+    for _ in range(batch_size):
+      sentence = random.sample(words, sequence_length)
+      features.append(sentence)
+
+    x_data = np.array(features)
+    y_data = np.random.randint(n_classes, size=batch_size)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        y=y_data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        y=y_data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        n_classes=n_classes,
+        batch_size=batch_size)
+
+  def testParseExampleInputFn(self):
+    """Tests complete flow with input_fn constructed from parse_example."""
+    n_classes = 3
+    batch_size = 10
+    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']
+
+    serialized_examples = []
+    for _ in range(batch_size):
+      sequence_length = random.randint(1, len(words))
+      sentence = random.sample(words, sequence_length)
+      label = random.randint(0, n_classes - 1)
+      example = example_pb2.Example(features=feature_pb2.Features(
+          feature={
+              'tokens':
+                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                      value=sentence)),
+              'label':
+                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                      value=[label])),
+          }))
+      serialized_examples.append(example.SerializeToString())
+
+    feature_spec = {
+        'tokens': parsing_ops.VarLenFeature(dtypes.string),
+        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+    def _train_input_fn():
+      features = parsing_ops.parse_example(serialized_examples, feature_spec)
+      labels = features.pop('label')
+      return features, labels
+    def _eval_input_fn():
+      features = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      labels = features.pop('label')
+      return features, labels
+    def _predict_input_fn():
+      features = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      features.pop('label')
+      return features, None
+
+    self._test_complete_flow(
+        train_input_fn=_train_input_fn,
+        eval_input_fn=_eval_input_fn,
+        predict_input_fn=_predict_input_fn,
+        n_classes=n_classes,
+        batch_size=batch_size)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index cbb68bd3eb257f9472515e5c29ce4f02057be321..4a5ed0ab0f97f7bbaf9d393aae34eac09bf38717 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -72,6 +72,7 @@ See the @{$python/contrib.framework} guide.
 @@variable
 @@VariableDeviceChooser
 @@convolutional_delta_orthogonal
+@@convolutional_orthogonal_2d
 @@zero_initializer
 
 @@load_checkpoint
@@ -115,9 +116,11 @@ from tensorflow.python.framework.smart_cond import smart_cond
 from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
+from tensorflow.python.ops.array_ops import broadcast_to
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
+from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['nest']
+_allowed_symbols = ['nest', 'broadcast_to']
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
index a97adf622e6e576f8b5ce2babe004cb3a46d80a5..983b6dc8e5a1512ba81ecbc8d5ca5adaea09afe4 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
@@ -65,7 +65,7 @@ def fused_conv2d_bias_activation(conv_input,
     side_input_scale: A scalar `float32` that will be multiplied by side_input.
         This is optional and defaults to 0.
     side_input: A `Tensor` of the format specified by `data_format`.
-        This is useful for imlementing ResNet blocks.
+        This is useful for implementing ResNet blocks.
     activation_mode: (optional) currently must be the default "Relu".
         Note that in qint8 mode, it also clips to 127, so acts like ReluX.
     data_format: Specifies the data format.
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index bb155aa2496cbafd9f0630d3dffb2ba69395186c..3d0ed899322c26bf4ae428930899d7a5885e9f21 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -566,7 +566,7 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
-def CalculateCovolvedOutputDim(input_dim, filter_dim, stride, padding_type):
+def CalculateConvolvedOutputDim(input_dim, filter_dim, stride, padding_type):
   """Calculates the size of an output dimension of a strided convolution.
 
   Given the sizes of the corresponding dimension of the input and filter shapes,
@@ -827,10 +827,10 @@ class FusedConvInt8Tests(test.TestCase):
             maxval=1.0,
             dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
 
-    output_height = CalculateCovolvedOutputDim(input_height, filter_height,
-                                               vertical_stride, padding_type)
-    output_width = CalculateCovolvedOutputDim(input_width, filter_width,
-                                              horizontal_stride, padding_type)
+    output_height = CalculateConvolvedOutputDim(input_height, filter_height,
+                                                vertical_stride, padding_type)
+    output_width = CalculateConvolvedOutputDim(input_width, filter_width,
+                                               horizontal_stride, padding_type)
     print("output_height=", output_height, ", output_width=", output_width)
 
     side_input, _, _ = gen_array_ops.quantize_v2(
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 461066bbb493932b342cee8f8842e899a2d84fff..b305f37791d71f5a6edeada2bb710a2e5f23087d 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -364,6 +364,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 663e49bdca3cb2dd9257da326488c877fcc4256d..4fb8d58bc9125664d42260de72b83b2362eff9ba 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -22,6 +22,7 @@ import os
 import tarfile
 import tempfile
 
+from absl.testing import parameterized
 import numpy as np
 from scipy import linalg as scp_linalg
 
@@ -182,13 +183,20 @@ def _run_with_mock(function, *args, **kwargs):
     return function(*args, **kwargs)
 
 
-class ClassifierMetricsTest(test.TestCase):
+class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
 
-  def test_run_inception_graph(self):
+  @parameterized.named_parameters(
+      ('GraphDef', False),
+      ('DefaultGraphDefFn', True))
+  def test_run_inception_graph(self, use_default_graph_def):
     """Test `run_inception` graph construction."""
     batch_size = 7
     img = array_ops.ones([batch_size, 299, 299, 3])
-    logits = _run_with_mock(classifier_metrics.run_inception, img)
+
+    if use_default_graph_def:
+      logits = _run_with_mock(classifier_metrics.run_inception, img)
+    else:
+      logits = classifier_metrics.run_inception(img, _get_dummy_graphdef())
 
     self.assertTrue(isinstance(logits, ops.Tensor))
     logits.shape.assert_is_compatible_with([batch_size, 1001])
@@ -196,14 +204,23 @@ class ClassifierMetricsTest(test.TestCase):
     # Check that none of the model variables are trainable.
     self.assertListEqual([], variables.trainable_variables())
 
-  def test_run_inception_graph_pool_output(self):
+  @parameterized.named_parameters(
+      ('GraphDef', False),
+      ('DefaultGraphDefFn', True))
+  def test_run_inception_graph_pool_output(self, use_default_graph_def):
     """Test `run_inception` graph construction with pool output."""
     batch_size = 3
     img = array_ops.ones([batch_size, 299, 299, 3])
-    pool = _run_with_mock(
-        classifier_metrics.run_inception,
-        img,
-        output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
+
+    if use_default_graph_def:
+      pool = _run_with_mock(
+          classifier_metrics.run_inception,
+          img,
+          output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
+    else:
+      pool = classifier_metrics.run_inception(
+          img, _get_dummy_graphdef(),
+          output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
 
     self.assertTrue(isinstance(pool, ops.Tensor))
     pool.shape.assert_is_compatible_with([batch_size, 2048])
diff --git a/tensorflow/contrib/graph_editor/select.py b/tensorflow/contrib/graph_editor/select.py
index 3ea6ff4d6163b107ca0daaf3b9ad1daf0ccc1f6f..d700e6e1a7523622f845acbbc353eb0f438c9bc2 100644
--- a/tensorflow/contrib/graph_editor/select.py
+++ b/tensorflow/contrib/graph_editor/select.py
@@ -383,6 +383,7 @@ def get_within_boundary_ops(ops,
 def get_forward_walk_ops(seed_ops,
                          inclusive=True,
                          within_ops=None,
+                         within_ops_fn=None,
                          stop_at_ts=(),
                          control_outputs=None):
   """Do a forward graph walk and return all the visited ops.
@@ -395,6 +396,9 @@ def get_forward_walk_ops(seed_ops,
     within_ops: an iterable of `tf.Operation` within which the search is
       restricted. If `within_ops` is `None`, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     stop_at_ts: an iterable of tensors at which the graph walk stops.
     control_outputs: a `util.ControlOutputs` instance or None.
       If not `None`, it will be used while walking the graph forward.
@@ -423,7 +427,8 @@ def get_forward_walk_ops(seed_ops,
     seed_ops &= within_ops
 
   def is_within(op):
-    return within_ops is None or op in within_ops
+    return (within_ops is None or op in within_ops) and (
+        within_ops_fn is None or within_ops_fn(op))
 
   result = list(seed_ops)
   wave = set(seed_ops)
@@ -450,6 +455,7 @@ def get_forward_walk_ops(seed_ops,
 def get_backward_walk_ops(seed_ops,
                           inclusive=True,
                           within_ops=None,
+                          within_ops_fn=None,
                           stop_at_ts=(),
                           control_inputs=False):
   """Do a backward graph walk and return all the visited ops.
@@ -462,6 +468,9 @@ def get_backward_walk_ops(seed_ops,
     within_ops: an iterable of `tf.Operation` within which the search is
       restricted. If `within_ops` is `None`, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     stop_at_ts: an iterable of tensors at which the graph walk stops.
     control_inputs: if True, control inputs will be used while moving backward.
   Returns:
@@ -488,7 +497,8 @@ def get_backward_walk_ops(seed_ops,
     seed_ops &= within_ops
 
   def is_within(op):
-    return within_ops is None or op in within_ops
+    return (within_ops is None or op in within_ops) and (
+        within_ops_fn is None or within_ops_fn(op))
 
   result = list(seed_ops)
   wave = set(seed_ops)
@@ -516,6 +526,7 @@ def get_walks_intersection_ops(forward_seed_ops,
                                forward_inclusive=True,
                                backward_inclusive=True,
                                within_ops=None,
+                               within_ops_fn=None,
                                control_inputs=False,
                                control_outputs=None,
                                control_ios=None):
@@ -535,6 +546,9 @@ def get_walks_intersection_ops(forward_seed_ops,
     within_ops: an iterable of tf.Operation within which the search is
       restricted. If within_ops is None, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     control_inputs: A boolean indicating whether control inputs are enabled.
     control_outputs: An instance of util.ControlOutputs or None. If not None,
       control outputs are enabled.
@@ -555,11 +569,13 @@ def get_walks_intersection_ops(forward_seed_ops,
       forward_seed_ops,
       inclusive=forward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_outputs=control_outputs)
   backward_ops = get_backward_walk_ops(
       backward_seed_ops,
       inclusive=backward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_inputs=control_inputs)
   return [op for op in forward_ops if op in backward_ops]
 
@@ -569,6 +585,7 @@ def get_walks_union_ops(forward_seed_ops,
                         forward_inclusive=True,
                         backward_inclusive=True,
                         within_ops=None,
+                        within_ops_fn=None,
                         control_inputs=False,
                         control_outputs=None,
                         control_ios=None):
@@ -587,6 +604,9 @@ def get_walks_union_ops(forward_seed_ops,
       resulting set.
     within_ops: restrict the search within those operations. If within_ops is
       None, the search is done within the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     control_inputs: A boolean indicating whether control inputs are enabled.
     control_outputs: An instance of util.ControlOutputs or None. If not None,
       control outputs are enabled.
@@ -607,11 +627,13 @@ def get_walks_union_ops(forward_seed_ops,
       forward_seed_ops,
       inclusive=forward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_outputs=control_outputs)
   backward_ops = get_backward_walk_ops(
       backward_seed_ops,
       inclusive=backward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_inputs=control_inputs)
   return util.concatenate_unique(forward_ops, backward_ops)
 
diff --git a/tensorflow/contrib/graph_editor/tests/select_test.py b/tensorflow/contrib/graph_editor/tests/select_test.py
index 82f999637d0c1866a5a329974f021fe2e30fd33f..d12c6d3cbd11dde2b609a59154297a8907b0cadc 100644
--- a/tensorflow/contrib/graph_editor/tests/select_test.py
+++ b/tensorflow/contrib/graph_editor/tests/select_test.py
@@ -77,12 +77,10 @@ class SelectTest(test.TestCase):
     """Test for ge.get_ops_ios."""
     control_outputs = ge.util.ControlOutputs(self.graph)
     self.assertEqual(
-        len(ge.get_ops_ios(
-            self.h.op, control_ios=control_outputs)), 3)
+        len(ge.get_ops_ios(self.h.op, control_ios=control_outputs)), 3)
     self.assertEqual(len(ge.get_ops_ios(self.h.op)), 2)
     self.assertEqual(
-        len(ge.get_ops_ios(
-            self.c.op, control_ios=control_outputs)), 6)
+        len(ge.get_ops_ios(self.c.op, control_ios=control_outputs)), 6)
     self.assertEqual(len(ge.get_ops_ios(self.c.op)), 5)
 
   def test_compute_boundary_ts_0(self):
@@ -135,16 +133,49 @@ class SelectTest(test.TestCase):
     ops = ge.get_walks_intersection_ops([self.c.op], [self.g.op])
     self.assertEqual(len(ops), 2)
 
+    ops = ge.get_walks_intersection_ops([self.a.op], [self.f.op])
+    self.assertEqual(len(ops), 3)
+    self.assertTrue(self.a.op in ops)
+    self.assertTrue(self.c.op in ops)
+    self.assertTrue(self.f.op in ops)
+
+    within_ops = [self.a.op, self.f.op]
+    ops = ge.get_walks_intersection_ops(
+        [self.a.op], [self.f.op], within_ops=within_ops)
+    self.assertEqual(len(ops), 0)
+
+    within_ops_fn = lambda op: op in [self.a.op, self.f.op]
+    ops = ge.get_walks_intersection_ops(
+        [self.a.op], [self.f.op], within_ops_fn=within_ops_fn)
+    self.assertEqual(len(ops), 0)
+
   def test_get_walks_union(self):
     """Test for ge.get_walks_union_ops."""
     ops = ge.get_walks_union_ops([self.f.op], [self.g.op])
     self.assertEqual(len(ops), 6)
 
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op])
+    self.assertEqual(len(ops), 8)
+
+    within_ops = [self.a.op, self.c.op, self.d.op, self.f.op]
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op],
+                                 within_ops=within_ops)
+    self.assertEqual(len(ops), 4)
+    self.assertTrue(self.b.op not in ops)
+
+    within_ops_fn = lambda op: op in [self.a.op, self.c.op, self.f.op]
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op],
+                                 within_ops_fn=within_ops_fn)
+    self.assertEqual(len(ops), 3)
+    self.assertTrue(self.b.op not in ops)
+    self.assertTrue(self.d.op not in ops)
+
   def test_select_ops(self):
     parameters = (
         (("^foo/",), 7),
         (("^foo/bar/",), 4),
-        (("^foo/bar/", "a"), 5),)
+        (("^foo/bar/", "a"), 5),
+    )
     for param, length in parameters:
       ops = ge.select_ops(*param, graph=self.graph)
       self.assertEqual(len(ops), length)
@@ -152,7 +183,8 @@ class SelectTest(test.TestCase):
   def test_select_ts(self):
     parameters = (
         (".*:0", 8),
-        (r".*/bar/\w+:0", 4),)
+        (r".*/bar/\w+:0", 4),
+    )
     for regex, length in parameters:
       ts = ge.select_ts(regex, graph=self.graph)
       self.assertEqual(len(ts), length)
@@ -160,12 +192,121 @@ class SelectTest(test.TestCase):
   def test_select_ops_and_ts(self):
     parameters = (
         (("^foo/.*",), 7, 0),
-        (("^foo/.*", "(?#ts)^foo/bar/.*"), 7, 4),)
+        (("^foo/.*", "(?#ts)^foo/bar/.*"), 7, 4),
+    )
     for param, l0, l1 in parameters:
       ops, ts = ge.select_ops_and_ts(*param, graph=self.graph)
       self.assertEqual(len(ops), l0)
       self.assertEqual(len(ts), l1)
 
+  def test_forward_walk_ops(self):
+    seed_ops = [self.a.op, self.d.op]
+    # Include all ops except for self.g.op
+    within_ops = [
+        x.op for x in [self.a, self.b, self.c, self.d, self.e, self.f, self.h]
+    ]
+    # For the fn, exclude self.e.op.
+    within_ops_fn = lambda op: op not in (self.e.op,)
+    stop_at_ts = (self.f,)
+
+    with self.graph.as_default():
+      # No b.op since it's an independent source node.
+      # No g.op from within_ops.
+      # No e.op from within_ops_fn.
+      # No h.op from stop_at_ts and within_ops.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(
+          set(ops), set([self.a.op, self.c.op, self.d.op, self.f.op]))
+
+      # Also no a.op and d.op when inclusive=False
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.c.op, self.f.op]))
+
+      # Not using within_ops_fn adds e.op.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.c.op, self.e.op, self.f.op]))
+
+      # Not using stop_at_ts adds back h.op.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops, inclusive=False, within_ops=within_ops)
+      self.assertEqual(
+          set(ops), set([self.c.op, self.e.op, self.f.op, self.h.op]))
+
+      # Starting just form a (the tensor, not op) omits a, b, d.
+      ops = ge.select.get_forward_walk_ops([self.a], inclusive=True)
+      self.assertEqual(
+          set(ops), set([self.c.op, self.e.op, self.f.op, self.g.op,
+                         self.h.op]))
+
+  def test_backward_walk_ops(self):
+    seed_ops = [self.h.op]
+    # Include all ops except for self.g.op
+    within_ops = [
+        x.op for x in [self.a, self.b, self.c, self.d, self.e, self.f, self.h]
+    ]
+    # For the fn, exclude self.c.op.
+    within_ops_fn = lambda op: op not in (self.c.op,)
+    stop_at_ts = (self.f,)
+
+    with self.graph.as_default():
+      # Backward walk only includes h since we stop at f and g is not within.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.h.op]))
+
+      # If we do inclusive=False, the result is empty.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set())
+
+      # Removing stop_at_fs adds f.op, d.op.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn)
+      self.assertEqual(set(ops), set([self.d.op, self.f.op, self.h.op]))
+
+      # Not using within_ops_fn adds back ops for a, b, c.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops, inclusive=True, within_ops=within_ops)
+      self.assertEqual(
+          set(ops),
+          set([
+              self.a.op, self.b.op, self.c.op, self.d.op, self.f.op, self.h.op
+          ]))
+
+      # Vanially backward search via self.h.op includes everything excpet e.op.
+      ops = ge.select.get_backward_walk_ops(seed_ops, inclusive=True)
+      self.assertEqual(
+          set(ops),
+          set([
+              self.a.op, self.b.op, self.c.op, self.d.op, self.f.op, self.g.op,
+              self.h.op
+          ]))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/graph_editor/tests/transform_test.py b/tensorflow/contrib/graph_editor/tests/transform_test.py
index 2603de640735a612cbd883cc6227fe3cd9f11fca..97f38c923f4a19cedf3e16203ca1e66b7e5e45d2 100644
--- a/tensorflow/contrib/graph_editor/tests/transform_test.py
+++ b/tensorflow/contrib/graph_editor/tests/transform_test.py
@@ -18,9 +18,11 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
 import numpy as np
 from tensorflow.contrib import graph_editor as ge
 from tensorflow.contrib.graph_editor.tests import match
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -42,6 +44,7 @@ class TransformTest(test.TestCase):
     self.graph = ops.Graph()
     with self.graph.as_default():
       c0 = constant_op.constant(1.0, shape=[10], name="Const")
+      c0.op._set_attr("_foo", attr_value_pb2.AttrValue(s=b"foo"))
       c1 = constant_op.constant(1.0, shape=[10], name="Const")
       c2 = constant_op.constant(1.0, shape=[10], name="Const")
       i = constant_op.constant(1.0, shape=[10], name="Input")
@@ -112,6 +115,32 @@ class TransformTest(test.TestCase):
     top = ge.select_ops("^AddNoise_2$", graph=graph)[0]
     self.assertTrue(matcher2(top))
 
+  def test_transform_nodedef_fn(self):
+    transformer = ge.Transformer()
+
+    def nodedef_fn(node_def):
+      if "_foo" in node_def.attr:
+        del node_def.attr["_foo"]
+      node_def.attr["_bar"].s = b"bar"
+      return node_def
+
+    my_copy_op_handler = functools.partial(
+        ge.transform.copy_op_handler, nodedef_fn=nodedef_fn)
+    transformer.transform_op_handler = my_copy_op_handler
+
+    graph = ops.Graph()
+    transformer(self.graph, graph, "", "")
+
+    c0_before = self.graph.get_operation_by_name("Const")
+    c0_after = graph.get_operation_by_name("Const")
+    self.assertEquals(c0_before.get_attr("_foo"), b"foo")
+    with self.assertRaises(ValueError):
+      c0_after.get_attr("_foo")
+
+    all_ops = graph.get_operations()
+    for op in all_ops:
+      self.assertEquals(op.get_attr("_bar"), b"bar")
+
   def test_copy_with_input_replacements(self):
     with self.graph.as_default():
       ten = constant_op.constant(10.0, shape=[10], name="Input")
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index d8a48387a745e7d88cc6a74c96cb21a2ba1cfa1f..a320a3f232fc1dc8c9ccfd1d0f2a9a40225db5cb 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -129,7 +129,7 @@ def transform_op_if_inside_handler(info, op, keep_if_possible=True):
       return None
 
 
-def copy_op_handler(info, op, new_inputs, copy_shape=True):
+def copy_op_handler(info, op, new_inputs, copy_shape=True, nodedef_fn=None):
   """Copy a `tf.Operation`.
 
   Args:
@@ -137,6 +137,11 @@ def copy_op_handler(info, op, new_inputs, copy_shape=True):
     op: the `tf.Operation` to be copied.
     new_inputs: The new inputs for this op.
     copy_shape: also copy the shape of the tensor
+    nodedef_fn: If provided, a function that will be run on the NodeDef
+      and should return a mutated NodeDef before a new Operation is created.
+      This is useful as certain features cannot be set on the Operation and
+      must be modified in NodeDef.
+
   Returns:
     A `(op, op_outputs)` tuple containing the transformed op and its outputs.
   """
@@ -155,6 +160,10 @@ def copy_op_handler(info, op, new_inputs, copy_shape=True):
   name_ = info.graph_.unique_name(name_)
   node_def_.name = name_
 
+  # Mutate NodeDef if requested:
+  if nodedef_fn is not None:
+    node_def_ = nodedef_fn(node_def_)
+
   # Copy the other inputs needed for initialization
   output_types_ = op._output_types[:]
   input_types_ = op._input_types[:]
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 163993a3f6bb1bedcdffb32944a98c7cc846878e..68e34f3b0938f795c8ad4c8c75226f6b0afe188d 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -42,11 +42,12 @@ If you've finished walking through the quick start guide, you may want to try bu
 
 ### Build libhexagon\_nn\_skel.so
 
-Download Hexagon NN library from codeaurora.org and build it.
+Download Hexagon NN library from codeaurora.org and build it. For Hexagon SDK 3.0, we need use the compatible version([721b2d58f](https://source.codeaurora.org/quic/hexagon_nn/nnlib/commit/?id=721b2d58f0f4e2d5b182f41e6b7c4db5356bf0fb)) of nnlib.
 
 ```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
+git reset 721b2d58f --hard
 ```
 
 Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index b71ff9cd507faac66b3a33d3c02ec9b5901d814a..645abbf0b0ea5465dadf55d065e997e16940c18d 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -53,7 +53,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
   OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                           DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
                           &tranformation_matrix));
-  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // TODO(huangyp): It takes about 3.5 us to compute tranformation_matrix
   // with one thread. Improve its performance if necessary.
   internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
       delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
diff --git a/tensorflow/contrib/image/ops/distort_image_ops.cc b/tensorflow/contrib/image/ops/distort_image_ops.cc
index b169b0b2b22ad6432baed2cc96711da5ca995875..ca49635d5d0bc7bb84b19508a74be74362d96ddf 100644
--- a/tensorflow/contrib/image/ops/distort_image_ops.cc
+++ b/tensorflow/contrib/image/ops/distort_image_ops.cc
@@ -36,9 +36,9 @@ REGISTER_OP("AdjustHsvInYiq")
 Adjust the YIQ hue of one or more images.
 
 `images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
+interpreted as channels, and must be three.
 
-We used linear transfomation described in:
+We used linear transformation described in:
  beesbuzz.biz/code/hsv_color_transforms.php
 The input image is considered in the RGB colorspace. Conceptually, the RGB
 colors are first mapped into YIQ space, rotated around the Y channel by
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 68771b3d054a64ba94141c092e20df1ed6b2339b..ebdcaea7abae2a967786831b62b331897aa3f6a3 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -93,7 +93,7 @@ row_to_col_match_indices: A vector of length num_rows, which is the number of
   If `row_to_col_match_indices[i]` is not -1, row i is matched to column
   `row_to_col_match_indices[i]`.
 col_to_row_match_indices: A vector of length num_columns, which is the number
-  of columns of the input ditance matrix.
+  of columns of the input distance matrix.
   If `col_to_row_match_indices[j]` is not -1, column j is matched to row
   `col_to_row_match_indices[j]`.
 )doc");
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 8139d4272d6950815bd39a64e86e0f7422e6f799..bd784c6bda0344c092c1ae0af2c60be50fdff102 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -69,7 +69,7 @@ Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 Given the 2-D tensor 'depth_values' with encoded Z values, this operation will
 encode 3-D data into a 2-D image.  The output of this Op is suitable for the
 encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
-encode 3-D data witin the image.
+encode 3-D data within the image.
 
 This Op is based upon:
 'http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper'
@@ -111,7 +111,7 @@ output_image_shape: Output size of returned image in X,Y, Channels 1-grayscale,
 output_data_window: Size of "DATA" window, must be equal to or smaller than 'output_image_shape', will be centered
   and use 'convergence_dots_size' for best fit to avoid overlap if possible
 
-image:= A tensor of size 'output_image_shape' with the encloded 'depth_values'
+image:= A tensor of size 'output_image_shape' with the encoded 'depth_values'
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index c139ae89d8d682d6b87813c3a21703ffa762f28e..cd984c80543886be1f682933e2e003bd3374e425 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -433,7 +433,7 @@ def bipartite_match(distance_mat,
       of rows of the input `distance_matrix`. If `row_to_col_match_indices[i]`
       is not -1, row i is matched to column `row_to_col_match_indices[i]`.
     col_to_row_match_indices: A vector of length num_columns, which is the
-      number of columns of the input ditance matrix.
+      number of columns of the input distance matrix.
       If `col_to_row_match_indices[j]` is not -1, column j is matched to row
       `col_to_row_match_indices[j]`.
   """
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index d4a6a5bcbb52511d4093587814100b2a0e8b2420..0ceb683ff4c6965a5ee4bcb04846a69d4d8ea0a5 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -45,7 +45,7 @@ def single_image_random_dot_stereograms(depth_values,
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
   will encode 3-D data into a 2-D image.  The output of this Op is suitable
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
-  corrupt the encode 3-D data witin the image.
+  corrupt the encode 3-D data within the image.
 
   Based upon [this
   paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 091f0a109801065f06110e2a313c24486d38109f..9a721a9d440e66eb30bb94daf2b6878318f1e75f 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -34,33 +34,31 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
   r"""Class that implements Random Fourier Feature Mapping (RFFM) in TensorFlow.
 
   The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
-  ```
   $$(exp(-||x-y||_2^2 / (2 * \sigma^2))$$
-  ```
 
   The implementation of RFFM is based on the following paper:
   "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
   (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
 
-  The mapping uses a matrix `\\(Omega \in R^{d x D}\\)` and a bias vector
-  `\\(b \in R^D\\)` where `d` is the input dimension (number of dense input
-  features) and `D` is the output dimension (i.e., dimension of the feature
-  space the input is mapped to). Each entry of `Omega` is sampled i.i.d. from a
-  (scaled) Gaussian distribution and each entry of `b` is sampled independently
-  and uniformly from [0, \\(2 * pi\\)].
-
-  For a single input feature vector x in R^d, its RFFM is defined as:
-  ```
-      $$sqrt(2/D) * cos(x * Omega + b)$$
-  ```
-  where `cos` is the element-wise cosine function and `x, b` are represented as
-  row vectors. The aforementioned paper shows that the linear kernel of
-  RFFM-mapped vectors approximates the Gaussian kernel of the initial vectors.
+  The mapping uses a matrix \\(\Omega \in R^{d x D}\\) and a bias vector
+  \\(b \in R^D\\) where \\(d\\) is the input dimension (number of dense input
+  features) and \\(D\\) is the output dimension (i.e., dimension of the feature
+  space the input is mapped to). Each entry of \\(\Omega\\) is sampled i.i.d.
+  from a (scaled) Gaussian distribution and each entry of \\(b\\) is sampled
+  independently and uniformly from [0, \\(2 * \pi\\)].
+
+  For a single input feature vector \\(x \in R^d\\), its RFFM is defined as:
+  $$\sqrt(2/D) * cos(x * \Omega + b)$$
+
+  where \\(cos\\) is the element-wise cosine function and \\(x, b\\) are
+  represented as row vectors. The aforementioned paper shows that the linear
+  kernel of RFFM-mapped vectors approximates the Gaussian kernel of the initial
+  vectors.
 
   """
 
   def __init__(self, input_dim, output_dim, stddev=1.0, seed=1, name=None):
-    """Constructs a RandomFourierFeatureMapper instance.
+    r"""Constructs a RandomFourierFeatureMapper instance.
 
     Args:
       input_dim: The dimension (number of features) of the tensors to be mapped.
@@ -68,11 +66,11 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
       stddev: The standard deviation of the Gaussian kernel to be approximated.
         The error of the classifier trained using this approximation is very
         sensitive to this parameter.
-      seed: An integer used to initialize the parameters (`Omega` and `b`) of
-        the mapper. For repeatable sequences across different invocations of the
-        mapper object (for instance, to ensure consistent mapping both at
-        training and eval/inference if these happen in different invocations),
-        set this to the same integer.
+      seed: An integer used to initialize the parameters (\\(\Omega\\) and
+        \\(b\\)) of the mapper. For repeatable sequences across different
+        invocations of the mapper object (for instance, to ensure consistent
+        mapping both at training and eval/inference if these happen in
+        different invocations), set this to the same integer.
       name: name for the mapper object.
     """
     # TODO(sibyl-vie3Poto): Maybe infer input_dim and/or output_dim (if not explicitly
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
index 3aa52aff196fd2699559f80b0c226f470c94b2a3..2c1f09936073a34816da61d771f59e848b8787af 100644
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
@@ -32,7 +32,7 @@ flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
 
 
 def main(unused_argv):
-  convnet.train_mnist_single_gpu(FLAGS.data_dir, num_epochs=200)
+  convnet.train_mnist_single_machine(FLAGS.data_dir, num_epochs=200)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 411da033c3a0d5e2148c02207f6e04efcd2a0efc..366e2a82d56602de0df706cbd382c21aba5540af 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -28,6 +28,7 @@ from collections import defaultdict
 from collections import OrderedDict
 from contextlib import contextmanager
 from functools import partial
+import warnings
 
 import math
 import six
@@ -171,6 +172,9 @@ class LayerCollection(object):
   def __init__(self,
                graph=None,
                name="LayerCollection"):
+    warnings.warn(
+        "tf.contrib.kfac is deprecated and will be removed by 2018-11-01. "
+        "Use https://pypi.python.org/pypi/kfac instead.")
     self.fisher_blocks = LayerParametersDict()
     self.fisher_factors = OrderedDict()
     self._linked_parameters = dict(
diff --git a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
index f701647c2b297015f025eb53bd191a1a8c54ec62..28ddaa69a14776e0c157c2e68105ee9e17bc3cbb 100644
--- a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
+++ b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
@@ -200,7 +200,7 @@ class SparseCrossOpTest(test.TestCase):
       self._assert_sparse_tensor_equals(expected_out, sess.run(op))
 
   def test_large_batch(self):
-    """Tests with large batch size to force multithreding.
+    """Tests with large batch size to force multithreading.
     """
     batch_size = 5000
     col1 = []
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index ffa208540dae975cb139ad6d76dcf392678ba0ee..49c3faf3b7f5eaa3b1542a1fdddcfaff99737a24 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -140,6 +140,9 @@ def safe_embedding_lookup_sparse(embedding_weights,
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+    if combiner != "sum":
+      sparse_ids, sparse_weights = _prune_invalid_weights(
+          sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
     sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
@@ -188,13 +191,23 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
   if sparse_weights is not None:
     is_id_valid = math_ops.logical_and(
-        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+        is_id_valid,
+        array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
   sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
   if sparse_weights is not None:
     sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
   return sparse_ids, sparse_weights
 
 
+def _prune_invalid_weights(sparse_ids, sparse_weights):
+  """Prune invalid weights (< 0) from the input ids and weights."""
+  if sparse_weights is not None:
+    is_weights_valid = math_ops.greater(sparse_weights.values, 0)
+    sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
+  return sparse_ids, sparse_weights
+
+
 def scattered_embedding_lookup(params,
                                values,
                                dimension,
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 9ccb589d698ad83c9654f5523ccdcb35b031b3da..3ae07cedab0be2da8ec633cfd84e07cfdfb11457 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -48,7 +48,7 @@ you should choose depends on (1) the feature type and (2) the model type.
    recommended.
 
      embedded_dept_column = embedding_column(
-       sparse_column_with_keys("department", ["math", "philosphy", ...]),
+       sparse_column_with_keys("department", ["math", "philosophy", ...]),
        dimension=10)
 
 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index 78affea44cbfb92523063968dbc1be98841854db..06060b99e7e58787994f20f037ffa451abbc7459 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -815,7 +815,7 @@ class _Transformer(object):
   """
 
   def __init__(self, columns_to_tensors):
-    """Initializes transfomer.
+    """Initializes transformer.
 
     Args:
       columns_to_tensors: A mapping from feature columns to tensors. 'string'
@@ -908,7 +908,7 @@ def _gather_feature_columns(feature_columns):
 
 
 def _check_forbidden_sequence_columns(feature_columns):
-  """Recursively cecks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
+  """Recursively checks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
   all_feature_columns = _gather_feature_columns(feature_columns)
   for feature_column in all_feature_columns:
     if isinstance(feature_column, _FORBIDDEN_SEQUENCE_COLUMNS):
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 10d7f6d076b4b4c6578d7adcffc4e9cc44d77ac6..151fc7a0d734fe8ea4d7872a4051e82d317a500e 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -932,7 +932,8 @@ def convolution(inputs,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
-                scope=None):
+                scope=None,
+                conv_dims=None):
   """Adds an N-D convolution followed by an optional batch_norm layer.
 
   It is required that 1 <= N <= 3.
@@ -993,6 +994,10 @@ def convolution(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for `variable_scope`.
+    conv_dims: Optional convolution dimensionality, when set it would use the
+      corresponding convolution (e.g. 2 for Conv 2D, 3 for Conv 3D, ..). When
+      leaved to None it would select the convolution dimensionality based on
+      the input rank (i.e. Conv ND, with N = input_rank - 2).
 
   Returns:
     A tensor representing the output of the operation.
@@ -1015,6 +1020,9 @@ def convolution(inputs,
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
 
+    if conv_dims is not None and conv_dims + 2 != input_rank:
+      raise ValueError('Convolution expects input with rank %d, got %d' %
+                       (conv_dims + 2, input_rank))
     if input_rank == 3:
       layer_class = convolutional_layers.Convolution1D
     elif input_rank == 4:
@@ -1061,10 +1069,134 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+@add_arg_scope
+def convolution1d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=1)
+
+convolution1d.__doc__ = convolution.__doc__
 
-convolution2d = convolution
-convolution3d = convolution
+@add_arg_scope
+def convolution2d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=2)
+
+convolution2d.__doc__ = convolution.__doc__
 
+@add_arg_scope
+def convolution3d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=3)
+
+convolution3d.__doc__ = convolution.__doc__
 
 @add_arg_scope
 def convolution2d_in_plane(
@@ -1410,7 +1542,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   Args:
      tensor: An `int` `Tensor` to be converted to a `Sparse`.
      eos_token: An integer.
-       It is part of the target label that signfies the end of a sentence.
+       It is part of the target label that signifies the end of a sentence.
      outputs_collections: Collection to add the outputs.
      scope: Optional scope for name_scope.
   """
@@ -1554,7 +1686,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
     output_collections: Collection to which the outputs will be added.
     scope: Optional scope for `name_scope`.
   Returns:
-    A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
+    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
     with innermost dimensions flattened to obtain rank `new_rank`.
 
   Raises:
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 997f910a2a97567adbd7ffa3e81a31d2ae0bad7e..b01fd5d5c95ac15c76f9dbe7c77f7e76f12149a9 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):
 
 class ConvolutionTest(test.TestCase):
 
+  def testInvalidShape(self):
+    with self.test_session():
+      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 5, got 4'):
+        layers_lib.convolution3d(images_2d, 32, 3)
+      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 4, got 5'):
+        layers_lib.convolution2d(images_3d, 32, 3)
+
   def testInvalidDataFormat(self):
     height, width = 7, 9
     with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
-      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
 
   def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
           layers_lib.convolution2d, [10, 20, 30],
           kernel_size=[3, 3],
           padding='SAME')
-      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
 
   def testStackWithScope(self):
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index e49589ddf627aa456496cebb2d0fc72fcdad710f..02d294c68f1e10108d774c5fe23b6371a7a9f0e6 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -247,9 +247,7 @@ class RevBlock(base.Layer):
     f_vars_idxs = [[] for _ in range(self.num_layers)]
     g_vars_idxs = [[] for _ in range(self.num_layers)]
 
-    for i, t in enumerate(variables):
-      ref = _underlying_variable_ref(t)
-
+    for i, ref in enumerate(variables):
       # Use the name to identify the layer number and function (f or g)
       regex = LAYER_RE.match(ref.name)
       layer_no = int(regex.group(1))
@@ -604,6 +602,7 @@ def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False):
     """Custom grad fn applying grad_fn for identity Defun."""
     fn_inputs, fn_vars, fn_outputs = nest.pack_sequence_as(
         defun_inputs, list(op.inputs))
+    fn_vars = [_underlying_variable_ref(v) for v in fn_vars]
     dys = list(dys)
     assert len(fn_outputs) == len(outputs)
     assert len(fn_outputs) == len(dys)
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index d1ad4e8c98de3e5c5ac212d55cc93707ba9c01cc..8c118402a4c85d4b0504754fcd0436ce8b00862d 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -60,8 +60,8 @@ class RevBlockTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
       x1, x2, x1_inv, x2_inv = sess.run([x1, x2, x1_inv, x2_inv])
 
-      self.assertAllClose(x1, x1_inv)
-      self.assertAllClose(x2, x2_inv)
+      self.assertAllClose(x1, x1_inv, atol=1e-5)
+      self.assertAllClose(x2, x2_inv, atol=1e-5)
 
   def testBackwardForward(self):
 
@@ -304,6 +304,20 @@ class RecomputeTest(test.TestCase):
           self.assertAllClose(current, g)
           current = g
 
+  def testResourceVariable(self):
+    @rev_block_lib.recompute_grad(tupleize_grads=True)
+    def layer_with_recompute(inputs):
+      var = variable_scope.get_variable("var", ())
+      return var * inputs
+
+    inputs = array_ops.ones((), dtypes.float32)
+    with variable_scope.variable_scope("layer", use_resource=True):
+      outputs = layer_with_recompute(inputs)
+      loss = math_ops.square(outputs)
+      grads = gradients_impl.gradients(loss, variables.trainable_variables())
+      self.assertEqual(1, len(grads))
+      self.assertTrue(grads[0] is not None)
+
 
 class FnWithCustomGradTest(test.TestCase):
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 8c85c431be69caaca6872111896b9487faf9e679..14ee2ba6094760d52180d6de7763ea88b8ee98c8 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -299,6 +299,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
     # so instead of breaking compatibility with that assumption, we
     # just manually initialize this field:
     self._train_distribute = None
+    self._device_fn = None
 
     gpu_options = config_pb2.GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
index 82848be7df653dd60219317d28f233767746f544..1f439965daf956665bbedc919281df0ee07b5d62 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os.path
 import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -26,6 +27,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.learn.python.learn.learn_io import *
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.platform import test
 
 # pylint: enable=wildcard-import
@@ -35,6 +37,13 @@ class DataFeederTest(test.TestCase):
   # pylint: disable=undefined-variable
   """Tests for `DataFeeder`."""
 
+  def setUp(self):
+    self._base_dir = os.path.join(self.get_temp_dir(), 'base_dir')
+    file_io.create_dir(self._base_dir)
+
+  def tearDown(self):
+    file_io.delete_recursively(self._base_dir)
+
   def _wrap_dict(self, data, prepend=''):
     return {prepend + '1': data, prepend + '2': data}
 
@@ -45,14 +54,14 @@ class DataFeederTest(test.TestCase):
   def _assert_dtype(self, expected_np_dtype, expected_tf_dtype, input_data):
     feeder = data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
     if isinstance(input_data, dict):
-      for k, v in list(feeder.input_dtype.items()):
+      for v in list(feeder.input_dtype.values()):
         self.assertEqual(expected_np_dtype, v)
     else:
       self.assertEqual(expected_np_dtype, feeder.input_dtype)
     with ops.Graph().as_default() as g, self.test_session(g):
       inp, _ = feeder.input_builder()
       if isinstance(inp, dict):
-        for k, v in list(inp.items()):
+        for v in list(inp.values()):
           self.assertEqual(expected_tf_dtype, v.dtype)
       else:
         self.assertEqual(expected_tf_dtype, inp.dtype)
@@ -301,7 +310,10 @@ class DataFeederTest(test.TestCase):
                                                 [0.60000002, 0.2]])
       self.assertAllClose(feed_dict[out.name], [[0., 0., 1.], [0., 1., 0.]])
 
-  def test_hdf5_data_feeder(self):
+  # TODO(rohanj): Fix this test by fixing data_feeder. Currently, h5py doesn't
+  # support permutation based indexing lookups (More documentation at
+  # http://docs.h5py.org/en/latest/high/dataset.html#fancy-indexing)
+  def DISABLED_test_hdf5_data_feeder(self):
 
     def func(df):
       inp, out = df.input_builder()
@@ -314,11 +326,12 @@ class DataFeederTest(test.TestCase):
       import h5py  # pylint: disable=g-import-not-at-top
       x = np.matrix([[1, 2], [3, 4]])
       y = np.array([1, 2])
-      h5f = h5py.File('test_hdf5.h5', 'w')
+      file_path = os.path.join(self._base_dir, 'test_hdf5.h5')
+      h5f = h5py.File(file_path, 'w')
       h5f.create_dataset('x', data=x)
       h5f.create_dataset('y', data=y)
       h5f.close()
-      h5f = h5py.File('test_hdf5.h5', 'r')
+      h5f = h5py.File(file_path, 'r')
       x = h5f['x']
       y = h5f['y']
       func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3))
diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index a7812f74d1e69276a4bba597b41e442bc4dbbc4a..8b7ff75ba5dc4edd01e7dc925e9c90c363f23a33 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -58,6 +58,6 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 4,
+    shard_count = 5,
     tags = ["noasan"],
 )
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py
index cc1a047d6a2b6029080fad3f240aa00f50504f07..e7407ede11409a47f4d9db96ad5b5d801ef1625d 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_block_diag_test.py
@@ -76,6 +76,8 @@ class SquareLinearOperatorBlockDiagTest(
         build_info((1, 1)),
         build_info((1, 3, 3)),
         build_info((5, 5), blocks=[(2, 2), (3, 3)]),
+        build_info((3, 7, 7), blocks=[(1, 2, 2), (3, 2, 2), (1, 3, 3)]),
+        build_info((2, 1, 5, 5), blocks=[(2, 1, 2, 2), (1, 3, 3)]),
     ]
 
   def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
@@ -184,70 +186,5 @@ class SquareLinearOperatorBlockDiagTest(
       block_diag.LinearOperatorBlockDiag([])
 
 
-# This test is for blocks with different batch dimensions.
-# LinearOperatorFullMatrix doesn't broadcast matmul/solve.
-class SquareDiagLinearOperatorBlockDiagTest(
-    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
-  """Most tests done in the base class LinearOperatorDerivedClassTest."""
-
-  def setUp(self):
-    # Increase from 1e-6 to 1e-4
-    self._atol[dtypes.float32] = 1e-4
-    self._atol[dtypes.complex64] = 1e-4
-    self._rtol[dtypes.float32] = 1e-4
-    self._rtol[dtypes.complex64] = 1e-4
-
-  @property
-  def _operator_build_infos(self):
-    build_info = linear_operator_test_util.OperatorBuildInfo
-    return [
-        build_info((3, 7, 7), blocks=[(1, 2, 2), (3, 2, 2), (1, 3, 3)]),
-        build_info((2, 1, 6, 6), blocks=[(2, 1, 2, 2), (1, 1, 4, 4)]),
-    ]
-
-  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
-    shape = list(build_info.shape)
-    expected_blocks = (
-        build_info.__dict__["blocks"] if "blocks" in build_info.__dict__
-        else [shape])
-    diag_matrices = [
-        linear_operator_test_util.random_uniform(
-            shape=block_shape[:-1], minval=1., maxval=20., dtype=dtype)
-        for block_shape in expected_blocks
-    ]
-
-    if use_placeholder:
-      diag_matrices_ph = [
-          array_ops.placeholder(dtype=dtype) for _ in expected_blocks
-      ]
-      diag_matrices = self.evaluate(diag_matrices)
-      # Evaluate here because (i) you cannot feed a tensor, and (ii)
-      # values are random and we want the same value used for both mat and
-      # feed_dict.
-      operator = block_diag.LinearOperatorBlockDiag(
-          [linalg.LinearOperatorDiag(m_ph) for m_ph in diag_matrices_ph])
-      feed_dict = {m_ph: m for (m_ph, m) in zip(
-          diag_matrices_ph, diag_matrices)}
-    else:
-      operator = block_diag.LinearOperatorBlockDiag(
-          [linalg.LinearOperatorDiag(m) for m in diag_matrices])
-      feed_dict = None
-      # Should be auto-set.
-      self.assertTrue(operator.is_square)
-
-    # Broadcast the shapes.
-    expected_shape = list(build_info.shape)
-
-    matrices = linear_operator_util.broadcast_matrix_batch_dims(
-        [array_ops.matrix_diag(diag_block) for diag_block in diag_matrices])
-
-    block_diag_dense = _block_diag_dense(expected_shape, matrices)
-    if not use_placeholder:
-      block_diag_dense.set_shape(
-          expected_shape[:-2] + [expected_shape[-1], expected_shape[-1]])
-
-    return operator, block_diag_dense, feed_dict
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 79a5928a21cb9a2633b2aac178f185ba333790d6..bed3d5139fcbf9d9e8b85605c752736f26af6793 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -30,6 +30,13 @@ from tensorflow.python.platform import test
 
 class SDCALogisticClassifierTest(test.TestCase):
 
+  def _single_threaded_test_session(self):
+    # TODO(andreasst): figure out why SDCALinearRegressor needs a single
+    # threaded session to pass in tsan mode but SDCALogisticClassifier does not.
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
+    return self.test_session(config=config)
+
   def testRealValuedFeatures(self):
     """Tests SDCALogisticClassifier works with real valued features."""
 
@@ -41,7 +48,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0]])
       }, constant_op.constant([[0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       maintenance_cost = feature_column_lib.real_valued_column(
           'maintenance_cost')
       sq_footage = feature_column_lib.real_valued_column('sq_footage')
@@ -66,7 +73,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[500.0, 800.0], [200.0, 600.0]])
       }, constant_op.constant([[0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       dense_feature = feature_column_lib.real_valued_column(
           'dense_feature', dimension=2)
       classifier = sdca_estimator.SDCALogisticClassifier(
@@ -86,7 +93,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price_bucket = feature_column_lib.bucketized_column(
           feature_column_lib.real_valued_column('price'),
           boundaries=[500.0, 700.0])
@@ -120,7 +127,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price = feature_column_lib.real_valued_column('price')
       country = feature_column_lib.sparse_column_with_hash_bucket(
           'country', hash_bucket_size=5)
@@ -151,7 +158,7 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 5])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       country = feature_column_lib.sparse_column_with_hash_bucket(
           'country', hash_bucket_size=5)
       country_weighted_by_price = feature_column_lib.weighted_sparse_column(
@@ -163,6 +170,38 @@ class SDCALogisticClassifierTest(test.TestCase):
       metrics = classifier.evaluate(input_fn=input_fn, steps=1)
       self.assertGreater(metrics['accuracy'], 0.9)
 
+  def testSparseFeaturesWithDuplicates(self):
+    """Tests SDCALogisticClassifier with duplicated sparse features."""
+
+    def input_fn():
+      return {
+          'example_id':
+              constant_op.constant(['1', '2']),
+          'age':
+              sparse_tensor.SparseTensor(
+                  values=['20-29'] * 5 + ['31-40'] * 5,
+                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
+                           [1, 0], [1, 0], [1, 0], [1, 0]],
+                  dense_shape=[2, 1]),
+          'gender':
+              sparse_tensor.SparseTensor(
+                  values=['m'] * 5 + ['f'] * 5,
+                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
+                           [1, 0], [1, 0], [1, 0], [1, 0]],
+                  dense_shape=[2, 1]),
+      }, constant_op.constant([[1], [0]])
+
+    with self._single_threaded_test_session():
+      age = feature_column_lib.sparse_column_with_hash_bucket(
+          'age', hash_bucket_size=10)
+      gender = feature_column_lib.sparse_column_with_hash_bucket(
+          'gender', hash_bucket_size=10)
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id', feature_columns=[age, gender])
+      classifier.fit(input_fn=input_fn, steps=50)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertLess(metrics['loss'], 0.060)
+
   def testCrossedFeatures(self):
     """Tests SDCALogisticClassifier with crossed features."""
 
@@ -182,7 +221,7 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 1])
       }, constant_op.constant([[0], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       language = feature_column_lib.sparse_column_with_hash_bucket(
           'language', hash_bucket_size=5)
       country = feature_column_lib.sparse_column_with_hash_bucket(
@@ -215,7 +254,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[3.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price = feature_column_lib.real_valued_column('price')
       sq_footage_bucket = feature_column_lib.bucketized_column(
           feature_column_lib.real_valued_column('sq_footage'),
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index dffdddacfb575636ea0b17797caf8502b8b8098c..5d4572bf6c761e0de2c9e6d7e17193abf0ebb170 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 from tensorflow.contrib import layers
 from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
 from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -181,28 +182,42 @@ class SDCAOptimizer(object):
         elif isinstance(
             column,
             (
+                layers.feature_column._WeightedSparseColumn,  # pylint: disable=protected-access
                 layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
                 layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
-          sparse_features.append(
-              SparseFeatureColumn(
-                  array_ops.reshape(
-                      array_ops.split(
-                          value=transformed_tensor.indices,
-                          num_or_size_splits=2,
-                          axis=1)[0], [-1]),
-                  array_ops.reshape(transformed_tensor.values, [-1]), None))
-          sparse_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
-          id_tensor = column.id_tensor(transformed_tensor)
-          weight_tensor = column.weight_tensor(transformed_tensor)
+
+          if isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
+            id_tensor = column.id_tensor(transformed_tensor)
+            weight_tensor = array_ops.reshape(
+                column.weight_tensor(transformed_tensor).values, [-1])
+          else:
+            id_tensor = transformed_tensor
+            weight_tensor = array_ops.ones(
+                [array_ops.shape(id_tensor.indices)[0]], dtypes.float32)
+
+          example_ids = array_ops.reshape(id_tensor.indices[:, 0], [-1])
+
+          flat_ids = array_ops.reshape(id_tensor.values, [-1])
+          projection_length = math_ops.reduce_max(flat_ids) + 1
+          # project ids based on example ids so that we can dedup ids that
+          # occur multiple times for a single example.
+          projected_ids = projection_length * example_ids + flat_ids
+
+          # Remove any redudant ids.
+          ids, idx = array_ops.unique(projected_ids)
+          # Keep only one example id per duplicated ids.
+          example_ids_filtered = math_ops.unsorted_segment_min(
+              example_ids, idx,
+              array_ops.shape(ids)[0])
+
+          # reproject ids back feature id space.
+          reproject_ids = (ids - projection_length * example_ids_filtered)
+
+          weights = array_ops.reshape(
+              math_ops.unsorted_segment_sum(weight_tensor, idx,
+                                            array_ops.shape(ids)[0]), [-1])
           sparse_feature_with_values.append(
-              SparseFeatureColumn(
-                  array_ops.reshape(
-                      array_ops.split(
-                          value=id_tensor.indices, num_or_size_splits=2, axis=1)
-                      [0], [-1]),
-                  array_ops.reshape(id_tensor.values, [-1]),
-                  array_ops.reshape(weight_tensor.values, [-1])))
+              SparseFeatureColumn(example_ids_filtered, reproject_ids, weights))
           sparse_feature_with_values_weights.append(
               columns_to_variables[column][0])
         else:
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index b4504f246a0f806d35d8c3d659717a86d2f2a4f5..65fba52d461461f4594e2222ef6df3849b741f99 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -90,7 +90,8 @@ $(wildcard tensorflow/contrib/lite/kernels/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \
-$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc)
+$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc) \
+$(wildcard tensorflow/contrib/lite/downloads/fft2d/fftsg.c)
 # Remove any duplicates.
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
 CORE_CC_EXCLUDE_SRCS := \
diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/contrib/lite/allocation.cc
index 4b322e027d48f4bf9f90d5b873c449d1ec31cc49..a4772731ecda92431c412672610a39c188dabf27 100644
--- a/tensorflow/contrib/lite/allocation.cc
+++ b/tensorflow/contrib/lite/allocation.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+#include <utility>
 
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc
index 8e47e2375e2e306c345a2b6caa2411abd9b3ceb0..4f836d367747e06de682b5764206d33f6e2fb983 100644
--- a/tensorflow/contrib/lite/arena_planner.cc
+++ b/tensorflow/contrib/lite/arena_planner.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/arena_planner.h"
+#include <utility>
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 2813d1c347163e67c70983d3dd49773f4a4b4544..b8f6b7fd59af9834edb4aa7aefa524c25ede66d2 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -200,8 +200,7 @@ def gen_zipped_test_files(name, files):
     native.genrule(
         name = name + "_" + f + ".files",
         cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
-               + " --zip_to_output " + f +
-               " $(@D) zipped"),
+               + " --zip_to_output " + f + " $(@D)"),
         outs = [out_file],
         tools = [
             ":generate_examples",
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 2b6c24768c0f35b91d0dabf8a5723e73f040cc3b..4910c89eaebabb7bd9a4e003b75fa6de4d5af69d 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -53,6 +53,8 @@ typedef struct {
   TfLitePadding padding;
   int stride_width;
   int stride_height;
+  int dilation_width_factor;
+  int dilation_height_factor;
   TfLiteFusedActivation activation;
 } TfLiteConvParams;
 
@@ -221,6 +223,10 @@ typedef struct {
   int shrink_axis_mask;
 } TfLiteStridedSliceParams;
 
+typedef struct {
+  TfLiteType output_type;
+} TfLiteArgMaxParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 17b791e4e2f38d9a1108d35d1298445a1c370727..859bc7ab70dc363e08800ca5c40eb0da6ca426b0 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -80,6 +80,9 @@ typedef enum {
   kTfLiteBuiltinCast = 53,
   kTfLiteBuiltinPrelu = 54,
   kTfLiteBuiltinMaximum = 55,
+  kTfLiteBuiltinArgMax = 56,
+  kTfLiteBuiltinMinimum = 57,
+  kTfLiteBuiltinLess = 58,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 45184b05ecefb504c75815ae900f3b605359a443..0b38f43cd32fbdfa0296eec7ef81aab76ebe5461 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -137,6 +137,7 @@ typedef enum {
   kTfLiteUInt8 = 3,
   kTfLiteInt64 = 4,
   kTfLiteString = 5,
+  kTfLiteBool = 6,
 } TfLiteType;
 
 // Parameters for asymmetric quantization. Quantized values can be converted
@@ -155,6 +156,7 @@ typedef union {
   char* raw;
   const char* raw_const;
   uint8_t* uint8;
+  bool* b;
 } TfLitePtrUnion;
 
 // Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index a93ed201d647ddf2359a57254a959871c13fb94f..840015a7fad173dbd2ea353786871dd4e89abb98 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -36,6 +36,7 @@ ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
 FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/master.zip"
+FFT2D_URL="https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -91,6 +92,7 @@ download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index b0236e9c608ec35437bcfe79c51149a76f9f416e..98d3b5bb8ad45bf34f6996b3361291896a451a6f 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -326,10 +326,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = YES;
@@ -373,10 +369,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 61ea5231e352f5e014f9200eccae69548574c034..203924f03d3101130049b9679328fac1e2da02bd 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -302,6 +302,19 @@ Options {
 }
 ```
 
+**LESS**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: a tensor of type bool, true whenever an element of the first tensor is less
+  than the corresponding element of the second tensor.
+}
+```
+
 **LOCAL_RESPONSE_NORMALIZATION**
 
 ```
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 4575fe884dc07963df5f0a26c5fe6680d92e409c..f25865460882233d1a0f32fe946214e7dba50dbe 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -337,9 +337,13 @@ TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
     case kTfLiteInt64:
       *bytes = sizeof(int64_t) * count;
       break;
+    case kTfLiteBool:
+      *bytes = sizeof(bool) * count;
+      break;
     default:
-      ReportError(&context_,
-                  "Only float32, int32, int64, uint8 supported currently.");
+      ReportError(
+          &context_,
+          "Only float32, int32, int64, uint8, bool supported currently.");
       return kTfLiteError;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 77db17878318276c6cf5067274a3af3be262c8e1..df67cce9de5e97017251d6a3c14851d1658941f1 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -48,6 +48,10 @@ template <>
 constexpr TfLiteType typeToTfLiteType<unsigned char>() {
   return kTfLiteUInt8;
 }
+template <>
+constexpr TfLiteType typeToTfLiteType<bool>() {
+  return kTfLiteBool;
+}
 
 // Forward declare since NNAPIDelegate uses Interpreter.
 class NNAPIDelegate;
@@ -208,7 +212,7 @@ class Interpreter {
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
-      int node_index) {
+      int node_index) const {
     if (node_index >= nodes_and_registration_.size() || node_index < 0)
       return nullptr;
     return &nodes_and_registration_[node_index];
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index b14230acd71b4afcbe0225622810e18817540f94..1dda55b8edf8f85293c473b51b8a19066bac5f73 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -117,6 +117,7 @@ java_test(
         "src/testdata/int64.bin",
         "src/testdata/invalid_model.bin",
         "src/testdata/uint8.bin",
+        "src/testdata/with_custom_op.lite",
     ],
     javacopts = JAVACOPTS,
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 300786c3ca01b12a46f7f9a6fe8fd720f97a79f4..18f64651889d7eeb4be961afc47554cbcc51a410 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -54,6 +54,9 @@ import android.view.Surface;
 import android.view.TextureView;
 import android.view.View;
 import android.view.ViewGroup;
+import android.widget.CompoundButton;
+import android.widget.NumberPicker;
+import android.widget.ToggleButton;
 import android.widget.TextView;
 import android.widget.Toast;
 import java.io.IOException;
@@ -82,6 +85,8 @@ public class Camera2BasicFragment extends Fragment
   private boolean runClassifier = false;
   private boolean checkedPermissions = false;
   private TextView textView;
+  private ToggleButton toggle;
+  private NumberPicker np;
   private ImageClassifier classifier;
 
   /** Max preview width that is guaranteed by Camera2 API */
@@ -289,6 +294,24 @@ public class Camera2BasicFragment extends Fragment
   public void onViewCreated(final View view, Bundle savedInstanceState) {
     textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
     textView = (TextView) view.findViewById(R.id.text);
+    toggle = (ToggleButton) view.findViewById(R.id.button);
+
+    toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
+      public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
+        classifier.setUseNNAPI(isChecked);
+      }
+    });
+
+    np = (NumberPicker) view.findViewById(R.id.np);
+    np.setMinValue(1);
+    np.setMaxValue(10);
+    np.setWrapSelectorWheel(true);
+    np.setOnValueChangedListener(new NumberPicker.OnValueChangeListener() {
+      @Override
+      public void onValueChange(NumberPicker picker, int oldVal, int newVal){
+        classifier.setNumThreads(newVal);
+      }
+    });
   }
 
   /** Load the model and labels. */
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index c57bb348c5b386a59327c7b1bc769717ca755269..d32c0779101cf8e795ee9d7e970401c2c03bb33a 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -142,6 +142,16 @@ public abstract class ImageClassifier {
     }
   }
 
+  public void setUseNNAPI(Boolean nnapi) {
+    if (tflite != null)
+        tflite.setUseNNAPI(nnapi);
+  }
+
+  public void setNumThreads(int num_threads) {
+    if (tflite != null)
+        tflite.setNumThreads(num_threads);
+  }
+
   /** Closes tflite to release resources. */
   public void close() {
     tflite.close();
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index 15305c436e0d997af15a326ab4027ea713ed8098..db557ad62f619e88f72426a48a74bffb0f57b818 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -22,24 +22,59 @@
         android:layout_width="wrap_content"
         android:layout_height="wrap_content"
         android:layout_alignParentStart="true"
+        android:layout_alignParentLeft="true"
         android:layout_alignParentTop="true" />
 
     <FrameLayout
         android:id="@+id/control"
         android:layout_width="match_parent"
-        android:layout_height="112dp"
+        android:layout_height="135dp"
         android:layout_alignParentBottom="true"
         android:layout_alignParentStart="true"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentEnd="true"
+        android:layout_alignParentRight="true"
+        android:layout_marginEnd="150dp"
+        android:layout_marginRight="150dp"
         android:background="@color/control_background">
 
-        <TextView android:id="@+id/text"
+        <TextView
+            android:id="@+id/text"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
-            android:paddingLeft="80dp"
+            android:paddingLeft="20dp"
             android:textColor="#FFF"
             android:textSize="20sp"
             android:textStyle="bold" />
 
     </FrameLayout>
 
+    <RelativeLayout
+        android:id="@+id/control2"
+        android:layout_width="match_parent"
+        android:layout_height="135dp"
+        android:layout_alignParentLeft="true"
+        android:layout_alignParentStart="true"
+        android:layout_alignTop="@+id/control"
+        android:layout_marginLeft="300dp"
+        android:layout_marginStart="300dp"
+        android:background="@color/control_background">
+
+        <ToggleButton
+            android:id="@+id/button"
+            android:textOff="@string/tflite"
+            android:textOn="@string/nnapi"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_alignParentLeft="true"
+            android:layout_alignParentStart="true" />
+
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_below="@+id/button"
+            android:visibility="visible" />
+    </RelativeLayout>
+
 </RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
index a08ec3eb629250a727cec49a822375fe5569f455..29a033bcd437c951ef6e8ba78f4fc3a0fcafac96 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
@@ -21,4 +21,6 @@
     <string name="toggle_turn_on">NN:On</string>
     <string name="toggle_turn_off">NN:Off</string>
     <string name="toggle">Use NNAPI</string>
+    <string name="tflite">tflite</string>
+    <string name="nnapi">NNAPI</string>
 </resources>
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index a33959dca4954e3c2aaed987839bdec1ba079b5e..451a1cd248226327f35985c8914bfa737db88ae8 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -212,6 +212,13 @@ public final class Interpreter implements AutoCloseable {
     }
   }
 
+  public void setNumThreads(int num_threads) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The interpreter has already been closed.");
+    }
+    wrapper.setNumThreads(num_threads);
+  }
+
   /** Release resources associated with the {@code Interpreter}. */
   @Override
   public void close() {
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index fc8187acfebf272a72ceb7844333bd589359cc2e..61a552db2303fdddbbf7ff6c14067794d0c30898 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -153,6 +153,10 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     useNNAPI(interpreterHandle, useNNAPI);
   }
 
+  void setNumThreads(int num_threads) {
+    numThreads(interpreterHandle, num_threads);
+  }
+
   /** Gets index of an input given its name. */
   int getInputIndex(String name) {
     if (inputsIndexes == null) {
@@ -321,6 +325,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native void useNNAPI(long interpreterHandle, boolean state);
 
+  private static native void numThreads(long interpreterHandle, int num_threads);
+
   private static native long createErrorReporter(int size);
 
   private static native long createModel(String modelPathOrBuffer, long errorHandle);
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 844226203bb02f4017b2f04da34ac81ac2b7a191..4c33a2dba4debf726c49016106502b7ef26ef122 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -315,6 +315,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
   interpreter->UseNNAPI(static_cast<bool>(state));
 }
 
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return;
+  interpreter->SetNumThreads(static_cast<int>(num_threads));
+}
+
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
     JNIEnv* env, jclass clazz, jint size) {
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index 0e28a77feea41d72be126d6e60fffbe7ce374a76..eaa765cb343e9764bd0ef018d636a76f4b8a13e4 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -61,7 +61,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JZ)
+ *  Signature: (JZ)V
  */
 JNIEXPORT void JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
@@ -69,6 +69,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
                                                            jlong handle,
                                                            jboolean state);
 
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JI)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads);
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index df0f3cbeb0e99c4a7cb6a9c610ce660f06454744..ac7c3f071f4bb9819be4358439f93414aecaa512 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -12,10 +12,7 @@ tf_cc_test(
     name = "optional_tensor_test",
     size = "small",
     srcs = ["optional_tensor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -108,10 +105,7 @@ tf_cc_test(
     name = "kernel_util_test",
     size = "small",
     srcs = ["kernel_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":kernel_util",
         "//tensorflow/contrib/lite/testing:util",
@@ -135,12 +129,14 @@ cc_library(
     srcs = [
         "activations.cc",
         "add.cc",
+        "arg_max.cc",
         "audio_spectrogram.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
         "bidirectional_sequence_lstm.cc",
         "bidirectional_sequence_rnn.cc",
         "cast.cc",
+        "comparisons.cc",
         "concatenation.cc",
         "conv.cc",
         "depthwise_conv.cc",
@@ -156,7 +152,7 @@ cc_library(
         "local_response_norm.cc",
         "lsh_projection.cc",
         "lstm.cc",
-        "maximum.cc",
+        "maximum_minimum.cc",
         "mean.cc",
         "mfcc.cc",
         "mul.cc",
@@ -242,10 +238,7 @@ tf_cc_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -258,9 +251,21 @@ tf_cc_test(
     name = "add_test",
     size = "small",
     srcs = ["add_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "arg_max_test",
+    size = "small",
+    srcs = ["arg_max_test.cc"],
     tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
+        "tflite_not_portable_ios",
     ],
     deps = [
         ":builtin_ops",
@@ -274,10 +279,7 @@ tf_cc_test(
     name = "div_test",
     size = "small",
     srcs = ["div_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -290,10 +292,7 @@ tf_cc_test(
     name = "sub_test",
     size = "small",
     srcs = ["sub_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -306,10 +305,7 @@ tf_cc_test(
     name = "transpose_test",
     size = "small",
     srcs = ["transpose_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -324,10 +320,7 @@ tf_cc_test(
     name = "space_to_batch_nd_test",
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -340,10 +333,7 @@ tf_cc_test(
     name = "batch_to_space_nd_test",
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -368,10 +358,7 @@ tf_cc_test(
     name = "concatenation_test",
     size = "small",
     srcs = ["concatenation_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -384,10 +371,7 @@ tf_cc_test(
     name = "conv_test",
     size = "small",
     srcs = ["conv_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -401,10 +385,7 @@ tf_cc_test(
     name = "depthwise_conv_test",
     size = "small",
     srcs = ["depthwise_conv_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -430,10 +411,7 @@ tf_cc_test(
     name = "basic_rnn_test",
     size = "small",
     srcs = ["basic_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -446,10 +424,7 @@ tf_cc_test(
     name = "bidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["bidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -462,10 +437,7 @@ tf_cc_test(
     name = "unidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -493,10 +465,7 @@ tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -509,10 +478,7 @@ tf_cc_test(
     name = "l2norm_test",
     size = "small",
     srcs = ["l2norm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -525,10 +491,7 @@ tf_cc_test(
     name = "exp_test",
     size = "small",
     srcs = ["exp_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -538,9 +501,9 @@ tf_cc_test(
 )
 
 tf_cc_test(
-    name = "maximum_test",
+    name = "maximum_minimum_test",
     size = "small",
-    srcs = ["maximum_test.cc"],
+    srcs = ["maximum_minimum_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -553,10 +516,7 @@ tf_cc_test(
     name = "mean_test",
     size = "small",
     srcs = ["mean_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -569,10 +529,7 @@ tf_cc_test(
     name = "mul_test",
     size = "small",
     srcs = ["mul_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -585,10 +542,7 @@ tf_cc_test(
     name = "pad_test",
     size = "small",
     srcs = ["pad_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -601,10 +555,7 @@ tf_cc_test(
     name = "reshape_test",
     size = "small",
     srcs = ["reshape_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -617,10 +568,7 @@ tf_cc_test(
     name = "gather_test",
     size = "small",
     srcs = ["gather_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:builtin_op_data",
@@ -634,10 +582,7 @@ tf_cc_test(
     name = "topk_v2_test",
     size = "small",
     srcs = ["topk_v2_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:builtin_op_data",
@@ -651,10 +596,7 @@ tf_cc_test(
     name = "resize_bilinear_test",
     size = "small",
     srcs = ["resize_bilinear_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -667,10 +609,7 @@ tf_cc_test(
     name = "svdf_test",
     size = "small",
     srcs = ["svdf_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -683,10 +622,7 @@ tf_cc_test(
     name = "embedding_lookup_test",
     size = "small",
     srcs = ["embedding_lookup_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -699,10 +635,7 @@ tf_cc_test(
     name = "embedding_lookup_sparse_test",
     size = "small",
     srcs = ["embedding_lookup_sparse_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -715,10 +648,7 @@ tf_cc_test(
     name = "fully_connected_test",
     size = "small",
     srcs = ["fully_connected_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -732,10 +662,7 @@ tf_cc_test(
     name = "local_response_norm_test",
     size = "small",
     srcs = ["local_response_norm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -748,10 +675,7 @@ tf_cc_test(
     name = "pooling_test",
     size = "small",
     srcs = ["pooling_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -764,10 +688,7 @@ tf_cc_test(
     name = "softmax_test",
     size = "small",
     srcs = ["softmax_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -781,10 +702,7 @@ tf_cc_test(
     name = "log_softmax_test",
     size = "small",
     srcs = ["log_softmax_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -798,10 +716,7 @@ tf_cc_test(
     name = "lsh_projection_test",
     size = "small",
     srcs = ["lsh_projection_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -814,10 +729,7 @@ tf_cc_test(
     name = "hashtable_lookup_test",
     size = "small",
     srcs = ["hashtable_lookup_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -831,10 +743,7 @@ tf_cc_test(
     name = "lstm_test",
     size = "small",
     srcs = ["lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -847,10 +756,7 @@ tf_cc_test(
     name = "skip_gram_test",
     size = "small",
     srcs = ["skip_gram_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -864,10 +770,7 @@ tf_cc_test(
     name = "space_to_depth_test",
     size = "small",
     srcs = ["space_to_depth_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -880,10 +783,7 @@ tf_cc_test(
     name = "split_test",
     size = "small",
     srcs = ["split_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -896,10 +796,7 @@ tf_cc_test(
     name = "squeeze_test",
     size = "small",
     srcs = ["squeeze_test.cc"],
-    tags = [
-        "tflite_not_portable_ios_arm64",
-        "tflite_not_portable_ios_x86_64",
-    ],
+    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -912,6 +809,21 @@ tf_cc_test(
     name = "strided_slice_test",
     size = "small",
     srcs = ["strided_slice_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "comparisons_test",
+    size = "small",
+    srcs = [
+        "comparisons_test.cc",
+    ],
     tags = [
         "tflite_not_portable_ios_arm64",
         "tflite_not_portable_ios_x86_64",
@@ -924,4 +836,16 @@ tf_cc_test(
     ],
 )
 
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/kernels/arg_max.cc b/tensorflow/contrib/lite/kernels/arg_max.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2c5e4ceadbc905d22eb02b450c88745a351f58f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/arg_max.cc
@@ -0,0 +1,178 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace arg_max {
+
+constexpr int kInputTensor = 0;
+constexpr int kAxis = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  // Make sure the axis is only 1 dimension.
+  TF_LITE_ENSURE_EQ(context, NumElements(axis), 1);
+
+  // Make sure the axis is only either int32 or int64.
+  TF_LITE_ENSURE(context,
+                 axis->type == kTfLiteInt32 || axis->type == kTfLiteInt64);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  auto* params = reinterpret_cast<TfLiteArgMaxParams*>(node->builtin_data);
+  switch (params->output_type) {
+    case kTfLiteInt32:
+      output->type = kTfLiteInt32;
+      break;
+    case kTfLiteInt64:
+      output->type = kTfLiteInt64;
+      break;
+    default:
+      context->ReportError(context, "Unknown index output data type");
+      return kTfLiteError;
+  }
+
+  // Check conditions for different types.
+  switch (input->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt32:
+      break;
+
+    default:
+      context->ReportError(context, "Only float32 and int types are supported");
+      return kTfLiteError;
+  }
+
+  // Copy the input dimensions to output except make the last dimension 1.
+  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  output_size->data[NumDimensions(input) - 1] = 1;
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+// The current impl actually ignores the axis argument.
+// Only determine the index of the maximum value in the last dimension.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+#define TF_LITE_ARG_MAX(data_type, axis_type, output_type)                     \
+  TF_LITE_ENSURE_EQ(context, GetTensorData<axis_type>(axis)[0], 3);            \
+  optimized_ops::ArgMax(GetTensorData<axis_type>(axis),                        \
+                        GetTensorData<data_type>(input), GetTensorDims(input), \
+                        GetTensorData<output_type>(output),                    \
+                        GetTensorDims(output))
+  if (axis->type == kTfLiteInt32) {
+    switch (output->type) {
+      case kTfLiteInt32: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int32_t, int32_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int32_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int32_t, int32_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      case kTfLiteInt64: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int32_t, int64_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int64_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int32_t, int64_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      default:
+        return kTfLiteError;
+    }
+  } else {
+    switch (output->type) {
+      case kTfLiteInt32: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int64_t, int32_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int32_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int64_t, int32_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      case kTfLiteInt64: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int64_t, int64_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int64_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int64_t, int64_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      default:
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_ARG_MAX
+
+  return kTfLiteOk;
+}
+
+}  // namespace arg_max
+
+TfLiteRegistration* Register_ARG_MAX() {
+  static TfLiteRegistration r = {nullptr, nullptr, arg_max::Prepare,
+                                 arg_max::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/arg_max_test.cc b/tensorflow/contrib/lite/kernels/arg_max_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31b15fe19ab87027c28bde9eaff7d88d03b2c213
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/arg_max_test.cc
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ArgMaxOpModel : public SingleOpModel {
+ public:
+  ArgMaxOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+                TensorType output_type, TensorType index_output_type) {
+    input_ = AddInput(input_type);
+    axis_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(output_type);
+    SetBuiltinOp(BuiltinOperator_ARG_MAX, BuiltinOptions_ArgMaxOptions,
+                 CreateArgMaxOptions(builder_, index_output_type).Union());
+    BuildInterpreter({input_shape, {1, 1, 1, 1}});
+  }
+
+  int input() { return input_; }
+  int axis() { return axis_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int axis_;
+  int output_;
+};
+
+TEST(ArgMaxOpTest, GetMaxArgFloat) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_FLOAT32,
+                               TensorType_INT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input(), {0.1, 0.9, 0.7, 0.3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgInt) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
+  ArgMaxOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgOutput64) {
+  ArgMaxOpModel<int64_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT64,
+                               TensorType_INT64);
+  model.PopulateTensor<int>(model.input(), {10, 2, 7, 8, 1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
index bc438f99c6a72fdbc2794dee03524db6a7523834..90edf4f9e3683f2987dd8299a1cd5233caa24479 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -123,6 +123,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                        GetTensorDims(op_context.input),                \
                        GetTensorData<int32_t>(op_context.block_shape), \
                        GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<int32_t>(op_context.crops),       \
+                       GetTensorDims(op_context.crops),                \
                        GetTensorData<scalar>(op_context.output),       \
                        GetTensorDims(op_context.output))
   switch (op_context.input->type) {  // Already know in/out types are same.
diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/contrib/lite/kernels/comparisons.cc
new file mode 100644
index 0000000000000000000000000000000000000000..87c413cb982dafd239818040d067738e786d43ff
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/comparisons.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace comparisons {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus LessPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Don't support string and bool.
+  TF_LITE_ENSURE(context,
+                 input1->type != kTfLiteString || input1->type != kTfLiteBool);
+  // Currently only support tensors have the same type.
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = kTfLiteBool;
+
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+
+#define TF_LITE_LESS(type, opname)                                          \
+  reference_ops::opname(GetTensorData<type>(input1), GetTensorDims(input1), \
+                        GetTensorData<type>(input2), GetTensorDims(input2), \
+                        GetTensorData<bool>(output), GetTensorDims(output));
+
+  // TODO(renjieliu): Support quantized data.
+  if (requires_broadcast) {
+    switch (input1->type) {
+      case kTfLiteFloat32:
+        TF_LITE_LESS(float, BroadcastLess);
+        break;
+      case kTfLiteInt32:
+        TF_LITE_LESS(int32_t, BroadcastLess);
+        break;
+      case kTfLiteInt64:
+        TF_LITE_LESS(int64_t, BroadcastLess);
+        break;
+      default:
+        context->ReportError(context,
+                             "Does not support type other than float|int");
+        return kTfLiteError;
+    }
+  } else {
+    switch (input1->type) {
+      case kTfLiteFloat32:
+        TF_LITE_LESS(float, Less);
+        break;
+      case kTfLiteInt32:
+        TF_LITE_LESS(int32_t, Less);
+        break;
+      case kTfLiteInt64:
+        TF_LITE_LESS(int64_t, Less);
+        break;
+      default:
+        context->ReportError(context,
+                             "Does not support type other than float|int");
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_LESS
+  return kTfLiteOk;
+}
+
+}  // namespace comparisons
+
+TfLiteRegistration* Register_LESS() {
+  static TfLiteRegistration r = {nullptr, nullptr, comparisons::LessPrepare,
+                                 comparisons::LessEval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/contrib/lite/kernels/comparisons_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da2d7f858984a4d3bb09ca8e485fe1599bea7ded
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/comparisons_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LessOpModel : public SingleOpModel {
+ public:
+  LessOpModel(std::initializer_list<int> input1_shape,
+              std::initializer_list<int> input2_shape, TensorType input_type) {
+    input1_ = AddInput(input_type);
+    input2_ = AddInput(input_type);
+    output_ = AddOutput(TensorType_BOOL);
+    SetBuiltinOp(BuiltinOperator_LESS, BuiltinOptions_LessOptions,
+                 CreateLessOptions(builder_).Union());
+    BuildInterpreter({input1_shape, input2_shape});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(ArgMaxOpTest, LessFloat) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessInt) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {1, 2, 6, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessBroadcast) {
+  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {7});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
+TEST(ArgMaxOpTest, LessBroadcastTwoD) {
+  LessOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 6, 8});
+  model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true,
+                                                   true, false, false, false}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 18ff33bf9f55ac1d25bb3392e714686c5305c2b8..3b467b3aa284586ab8e67ede55583adffbe06cc7 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -225,22 +225,27 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Matching GetWindowedOutputSize in TensorFlow.
   auto padding = params->padding;
-  auto computeOutSize = [padding](int imageSize, int filterSize,
-                                  int stride) -> int {
+  auto computeOutSize = [padding](int imageSize, int filterSize, int stride,
+                                  int dilationRate) -> int {
+    int effectiveFilterSize = (filterSize - 1) * dilationRate + 1;
     return padding == kTfLitePaddingSame
                ? (imageSize + stride - 1) / stride
                : padding == kTfLitePaddingValid
-                     ? (imageSize - filterSize + stride) / stride
+                     ? (imageSize - effectiveFilterSize + stride) / stride
                      : 0;
   };
 
-  int outWidth = computeOutSize(width, filter_width, params->stride_width);
-  int outHeight = computeOutSize(height, filter_height, params->stride_height);
+  int outWidth = computeOutSize(width, filter_width, params->stride_width,
+                                params->dilation_width_factor);
+  int outHeight = computeOutSize(height, filter_height, params->stride_height,
+                                 params->dilation_height_factor);
 
   data->padding.height =
-      ComputePadding(params->stride_height, height, filter_height, outHeight);
+      ComputePadding(params->stride_height, params->dilation_height_factor,
+                     height, filter_height, outHeight);
   data->padding.width =
-      ComputePadding(params->stride_width, width, filter_width, outWidth);
+      ComputePadding(params->stride_width, params->dilation_width_factor, width,
+                     filter_width, outWidth);
 
   TF_LITE_ENSURE(context, hasBias);
 
@@ -375,28 +380,40 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-
-  switch (kernel_type) {
+  KernelType effective_kernel_type;
+  if (((kernel_type == kMultithreadOptimized) ||
+       (kernel_type == kCblasOptimized)) &&
+      ((params->dilation_width_factor != 1) ||
+       (params->dilation_height_factor != 1))) {
+    // kMultithreadOptimized and kCblasOptimized do not support dilation.
+    // Therefore, fallback to optimized.
+    effective_kernel_type = kGenericOptimized;
+  } else {
+    effective_kernel_type = kernel_type;
+  }
+  switch (effective_kernel_type) {
     case kReference: {
-      reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                          GetTensorData<float>(filter), GetTensorDims(filter),
-                          GetTensorData<float>(bias), GetTensorDims(bias),
-                          params->stride_width, params->stride_height, 1, 1,
-                          data->padding.width, data->padding.height,
-                          output_activation_min, output_activation_max,
-                          GetTensorData<float>(output), GetTensorDims(output),
-                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      reference_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input),
+          GetTensorData<float>(filter), GetTensorDims(filter),
+          GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width,
+          params->stride_height, params->dilation_width_factor,
+          params->dilation_height_factor, data->padding.width,
+          data->padding.height, output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
       break;
     }
     case kGenericOptimized: {
-      optimized_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                          GetTensorData<float>(filter), GetTensorDims(filter),
-                          GetTensorData<float>(bias), GetTensorDims(bias),
-                          params->stride_width, params->stride_height, 1, 1,
-                          data->padding.width, data->padding.height,
-                          output_activation_min, output_activation_max,
-                          GetTensorData<float>(output), GetTensorDims(output),
-                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      optimized_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input),
+          GetTensorData<float>(filter), GetTensorDims(filter),
+          GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width,
+          params->stride_height, params->dilation_width_factor,
+          params->dilation_height_factor, data->padding.width,
+          data->padding.height, output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
       break;
     }
     case kMultithreadOptimized: {
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index d2393c3c97bb9516e2b8a6c8ae037dc0dfdfe64b..0dcfc826fd218d2d2dfbf89201d2c13fbfe6f0e1 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -46,7 +46,8 @@ class BaseConvolutionOpModel : public SingleOpModel {
       TfLiteRegistration* registration, const TensorData& input,
       const TensorData& filter, const TensorData& output, int stride_width = 2,
       int stride_height = 2, enum Padding padding = Padding_VALID,
-      enum ActivationFunctionType activation = ActivationFunctionType_NONE) {
+      enum ActivationFunctionType activation = ActivationFunctionType_NONE,
+      int dilation_width_factor = 1, int dilation_height_factor = 1) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
 
@@ -71,8 +72,9 @@ class BaseConvolutionOpModel : public SingleOpModel {
     }
 
     SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions,
-                 CreateConv2DOptions(builder_, padding, stride_width,
-                                     stride_height, activation)
+                 CreateConv2DOptions(
+                     builder_, padding, stride_width, stride_height, activation,
+                     dilation_width_factor, dilation_height_factor)
                      .Union());
 
     resolver_ = absl::make_unique<SingleOpResolver>(BuiltinOperator_CONV_2D,
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
index cad9ce114c8387047af2b63bee704035fd329330..eeda1bc3c5ba2da5b6546ce36925a6f20fc9cbae 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -140,10 +140,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int out_height =
       compute_out_size(height, filter_height, params->stride_height);
 
-  data->padding.height =
-      ComputePadding(params->stride_height, height, filter_height, out_height);
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
+                                        filter_height, out_height);
   data->padding.width =
-      ComputePadding(params->stride_width, width, filter_width, out_width);
+      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
 
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training.
diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc
index 6dd243ad62ece3e094529d923ce80d1d4a0c19ca..ec380c8e4956e5bcd0d7559bfd8f89a52d9d233c 100644
--- a/tensorflow/contrib/lite/kernels/div.cc
+++ b/tensorflow/contrib/lite/kernels/div.cc
@@ -106,6 +106,8 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_DIV
 }
 
+
+
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
@@ -118,7 +120,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   if (output->type == kTfLiteFloat32) {
     EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Div only supports FLOAT32 and quantized UINT8 now.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 167c0f1fde9202452a915cea69cbb935fa1af7b6..67dd1884966d8addafa54e80f9923aa66354dff9 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -154,6 +154,7 @@ cc_library(
     ],
     copts = tflite_copts(),
     deps = [
+        ":quantization_util",
         ":types",
         ":round",
         "//third_party/eigen3",
@@ -238,6 +239,7 @@ cc_library(
         "reference/reference_ops.h",
     ],
     deps = [
+        ":quantization_util",
         ":round",
         ":types",
         "//third_party/eigen3",
@@ -430,4 +432,13 @@ cc_library(
     ),
 )
 
+cc_test(
+    name = "batch_to_space_nd_test",
+    srcs = ["batch_to_space_nd_test.cc"],
+    deps = [
+        ":optimized_base",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
diff --git a/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5a2901ac8c297265e542cc30d3127fe774c19e78
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+// A light wrapper of GetIndexRange which returns a pair of start / end
+// indices.
+std::pair<int, int> GetIndexRange(int spatial_index_dim, int block_shape_dim,
+                                  int input_dim, int output_dim) {
+  int index_start = 0;
+  int index_end = 0;
+  optimized_ops::GetIndexRange(spatial_index_dim, block_shape_dim, input_dim,
+                               output_dim, &index_start, &index_end);
+  return {index_start, index_end};
+}
+
+TEST(BatchToSpaceNDTest, TestIndexRange) {
+  // Simple test case, no cropping.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/6,
+                          /*input_dim=*/1, /*output_dim=*/6),
+            std::make_pair(0, 1));
+
+  // No cropping and input_dim > 1.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/2, /*block_shape_dim=*/6,
+                          /*input_dim=*/5, /*output_dim=*/30),
+            std::make_pair(0, 5));
+
+  // With small cropping values (can be either at the beginning or at the end).
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/2,
+                          /*input_dim=*/3, /*output_dim=*/4),
+            std::make_pair(0, 2));
+
+  // With positive cropping values at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-2, /*block_shape_dim=*/2,
+                          /*input_dim=*/3, /*output_dim=*/4),
+            std::make_pair(1, 3));
+
+  // Large crop at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(6, 7));
+
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-26, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(6, 7));
+
+  // Large crop at the end.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/4, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  // Rounding up incorrectly will fail this test.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  // Extreme cropping with output of a single spatial location.
+  // Valid position 1, when large crop at the end.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(0, 1));
+
+  // Valid position 2, when large crop at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(6, 7));
+
+  // Invalid positions.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/1, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(0, 0));
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-29, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(6, 6));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 51426bb1c584b82af7b1a2ffaf5a675a1dd9a6fd..93fc6b6a76f67e2e75ba3a766e5ea6fb6bada77a 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -77,6 +77,7 @@ limitations under the License.
 #endif
 
 // TODO(ahentz): Clean up.
+using int8 = std::int8_t;
 using uint8 = std::uint8_t;
 using int16 = std::int16_t;
 using uint16 = std::uint16_t;
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index 0f78e0f728585ab27a8116a4707ac9614a6ea060..dd6932ffe7b7a6f1101f146ce6472b0df4cbda3b 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1696,15 +1696,15 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
 #ifdef __aarch64__
   // Call kernel optimized for depthwise convolutions using 3x3 filters if
   // parameters are supported.
-  if (Fast3by3FilterKernelSupported(input_dims, filter_dims, stride_width,
-                                    stride_height, pad_width, pad_height,
-                                    depth_multiplier, output_dims)) {
-    DepthwiseConv3by3FilterDepth16(
-        input_data, input_dims, input_offset, filter_data, filter_dims,
-        filter_offset, bias_data, bias_dims, stride_width, stride_height,
-        pad_width, pad_height, depth_multiplier, output_offset,
-        output_multiplier, output_shift, output_activation_min,
-        output_activation_max, output_data, output_dims);
+  if (Fast3x3FilterKernelSupported(input_dims, filter_dims, stride_width,
+                                   stride_height, pad_width, pad_height,
+                                   depth_multiplier, output_dims)) {
+    DepthwiseConv3x3Filter(input_data, input_dims, input_offset, filter_data,
+                           filter_dims, filter_offset, bias_data, bias_dims,
+                           stride_width, stride_height, pad_width, pad_height,
+                           depth_multiplier, output_offset, output_multiplier,
+                           output_shift, output_activation_min,
+                           output_activation_max, output_data, output_dims);
     return;
   }
 #endif
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index a349892076fcc4989e2f4cad188b383d2b31d470..55e0d5c3aa9ebb8b46403550e190b00a54cb53e5 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -40,412 +40,4380 @@ inline void preload_l1_keep(const uint8* ptr) {
 // NEON intrinsics vector data types.
 // See: https://bugs.llvm.org/show_bug.cgi?id=34945
 
-struct Int32x16 {
-  int32x4_t v0, v1, v2, v3;
+struct Int32x8 {
+  int32x4_t low, high;
 };
 
-struct Int16x16 {
-  int16x8_t low, high;
+struct Filter3x3x8 {
+  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7, f8;
 };
 
-struct Int16x16x3 {
-  Int16x16 v0, v1, v2;
+// Loads 3x3 filter of depth 8 and adds filter offsets.
+inline Filter3x3x8 Load3x3Filter(const uint8* filter_ptr, int32 filter_offset,
+                                 int output_depth) {
+  Filter3x3x8 filter;
+
+  uint8x8_t temp_u8_0, temp_u8_1, temp_u8_2, temp_u8_3, temp_u8_4, temp_u8_5,
+      temp_u8_6, temp_u8_7, temp_u8_8;
+  int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+
+  temp_u8_0 = vld1_u8(filter_ptr + 0 * output_depth);
+  temp_u8_1 = vld1_u8(filter_ptr + 1 * output_depth);
+  temp_u8_2 = vld1_u8(filter_ptr + 2 * output_depth);
+  temp_u8_3 = vld1_u8(filter_ptr + 3 * output_depth);
+  temp_u8_4 = vld1_u8(filter_ptr + 4 * output_depth);
+  temp_u8_5 = vld1_u8(filter_ptr + 5 * output_depth);
+  temp_u8_6 = vld1_u8(filter_ptr + 6 * output_depth);
+  temp_u8_7 = vld1_u8(filter_ptr + 7 * output_depth);
+  temp_u8_8 = vld1_u8(filter_ptr + 8 * output_depth);
+
+  filter.f0 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_0));
+  filter.f1 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_1));
+  filter.f2 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_2));
+  filter.f3 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_3));
+  filter.f4 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_4));
+  filter.f5 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_5));
+  filter.f6 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_6));
+  filter.f7 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_7));
+  filter.f8 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_8));
+
+  filter.f0 = vaddq_s16(filter.f0, filter_offset_vec);
+  filter.f1 = vaddq_s16(filter.f1, filter_offset_vec);
+  filter.f2 = vaddq_s16(filter.f2, filter_offset_vec);
+  filter.f3 = vaddq_s16(filter.f3, filter_offset_vec);
+  filter.f4 = vaddq_s16(filter.f4, filter_offset_vec);
+  filter.f5 = vaddq_s16(filter.f5, filter_offset_vec);
+  filter.f6 = vaddq_s16(filter.f6, filter_offset_vec);
+  filter.f7 = vaddq_s16(filter.f7, filter_offset_vec);
+  filter.f8 = vaddq_s16(filter.f8, filter_offset_vec);
+
+  return filter;
+}
+
+// Applies activation, offset and downquantize on a set of accumulator
+// registers that correspond to a 2x2 output of depth 8.
+// Stores results to output.
+inline void DownquantizeAndStore2x2Output(
+    Int32x8 acc_0, Int32x8 acc_1, Int32x8 acc_2, Int32x8 acc_3,
+    int32 output_offset, int32 output_multiplier, int output_shift,
+    int32 output_activation_min, int32 output_activation_max, uint8* output_ptr,
+    int output_depth, int output_width) {
+  using gemmlowp::RoundingDivideByPOT;
+  const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+  const int32x4_t output_activation_min_vec =
+      vdupq_n_s32(output_activation_min);
+  const int32x4_t output_activation_max_vec =
+      vdupq_n_s32(output_activation_max);
+
+  // Fixed-point multiplication.
+  acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
+  acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
+  acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
+  acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);
+  acc_2.low = vqrdmulhq_n_s32(acc_2.low, output_multiplier);
+  acc_2.high = vqrdmulhq_n_s32(acc_2.high, output_multiplier);
+  acc_3.low = vqrdmulhq_n_s32(acc_3.low, output_multiplier);
+  acc_3.high = vqrdmulhq_n_s32(acc_3.high, output_multiplier);
+
+  acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
+  acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
+  acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
+  acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);
+  acc_2.low = RoundingDivideByPOT(acc_2.low, output_shift);
+  acc_2.high = RoundingDivideByPOT(acc_2.high, output_shift);
+  acc_3.low = RoundingDivideByPOT(acc_3.low, output_shift);
+  acc_3.high = RoundingDivideByPOT(acc_3.high, output_shift);
+
+  // Add the output offset.
+  acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
+  acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
+  acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
+  acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);
+  acc_2.low = vaddq_s32(acc_2.low, output_offset_vec);
+  acc_2.high = vaddq_s32(acc_2.high, output_offset_vec);
+  acc_3.low = vaddq_s32(acc_3.low, output_offset_vec);
+  acc_3.high = vaddq_s32(acc_3.high, output_offset_vec);
+
+  // Apply the activation function.
+  acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
+  acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
+  acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
+  acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);
+  acc_2.low = vmaxq_s32(acc_2.low, output_activation_min_vec);
+  acc_2.high = vmaxq_s32(acc_2.high, output_activation_min_vec);
+  acc_3.low = vmaxq_s32(acc_3.low, output_activation_min_vec);
+  acc_3.high = vmaxq_s32(acc_3.high, output_activation_min_vec);
+
+  acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
+  acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
+  acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
+  acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
+  acc_2.low = vminq_s32(acc_2.low, output_activation_max_vec);
+  acc_2.high = vminq_s32(acc_2.high, output_activation_max_vec);
+  acc_3.low = vminq_s32(acc_3.low, output_activation_max_vec);
+  acc_3.high = vminq_s32(acc_3.high, output_activation_max_vec);
+
+  // Saturating cast to uint8 and store to destination.
+  int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
+  int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
+  int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
+  int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
+  int16x4_t acc_2_low_s16 = vqmovn_s32(acc_2.low);
+  int16x4_t acc_2_high_s16 = vqmovn_s32(acc_2.high);
+  int16x4_t acc_3_low_s16 = vqmovn_s32(acc_3.low);
+  int16x4_t acc_3_high_s16 = vqmovn_s32(acc_3.high);
+
+  int16x8_t res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
+  int16x8_t res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
+  int16x8_t res_2_s16 = vcombine_s16(acc_2_low_s16, acc_2_high_s16);
+  int16x8_t res_3_s16 = vcombine_s16(acc_3_low_s16, acc_3_high_s16);
+
+  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
+  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
+  uint8x8_t res_2_u8 = vqmovun_s16(res_2_s16);
+  uint8x8_t res_3_u8 = vqmovun_s16(res_3_s16);
+
+  vst1_u8(output_ptr, res_0_u8);
+  vst1_u8(output_ptr + output_depth, res_1_u8);
+  vst1_u8(output_ptr + output_depth * output_width, res_2_u8);
+  vst1_u8(output_ptr + output_depth * output_width + output_depth, res_3_u8);
+}
+
+inline void DownquantizeAndStore(Int32x8 acc, int32 output_offset,
+                                 int32 output_multiplier, int output_shift,
+                                 int32 output_activation_min,
+                                 int32 output_activation_max,
+                                 uint8* output_ptr) {
+  using gemmlowp::RoundingDivideByPOT;
+  const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+  const int32x4_t output_activation_min_vec =
+      vdupq_n_s32(output_activation_min);
+  const int32x4_t output_activation_max_vec =
+      vdupq_n_s32(output_activation_max);
+
+  acc.low = vqrdmulhq_n_s32(acc.low, output_multiplier);
+  acc.high = vqrdmulhq_n_s32(acc.high, output_multiplier);
+
+  acc.low = RoundingDivideByPOT(acc.low, output_shift);
+  acc.high = RoundingDivideByPOT(acc.high, output_shift);
+
+  acc.low = vaddq_s32(acc.low, output_offset_vec);
+  acc.high = vaddq_s32(acc.high, output_offset_vec);
+
+  acc.low = vmaxq_s32(acc.low, output_activation_min_vec);
+  acc.high = vmaxq_s32(acc.high, output_activation_min_vec);
+
+  acc.low = vminq_s32(acc.low, output_activation_max_vec);
+  acc.high = vminq_s32(acc.high, output_activation_max_vec);
+
+  int16x4_t acc_low_s16 = vqmovn_s32(acc.low);
+  int16x4_t acc_high_s16 = vqmovn_s32(acc.high);
+
+  int16x8_t res_s16 = vcombine_s16(acc_low_s16, acc_high_s16);
+  uint8x8_t res_u8 = vqmovun_s16(res_s16);
+  vst1_u8(output_ptr, res_u8);
+}
+
+inline void DownquantizeAndStore2Output(
+    Int32x8 acc_0, Int32x8 acc_1, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  {
+    using gemmlowp::RoundingDivideByPOT;
+    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+    const int32x4_t output_activation_min_vec =
+        vdupq_n_s32(output_activation_min);
+    const int32x4_t output_activation_max_vec =
+        vdupq_n_s32(output_activation_max);
+
+    // Fixed-point multiplication.
+    acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
+    acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
+    acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
+    acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);
+
+    acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
+    acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
+    acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
+    acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);
+
+    // Add the output offset.
+    acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
+    acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
+    acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
+    acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);
+
+    // Apply the activation function.
+    acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
+    acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
+    acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
+    acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);
+
+    acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
+    acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
+    acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
+    acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
+  }
+
+  // Saturating cast to uint8 and store to destination.
+  int16x8_t res_0_s16;
+  {
+    int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
+    int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
+    res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
+  }
+
+  int16x8_t res_1_s16;
+  {
+    int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
+    int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
+    res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
+  }
+
+  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
+  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
+  vst1_u8(output_ptr, res_0_u8);
+  vst1_u8(output_ptr + output_ptr_offset, res_1_u8);
+}
+
+// Performs multiply accumulate on 3 inputs of depth 8.
+inline Int32x8 MultiplyAccumulateRow(Int32x8 accum, int16x8_t f0, int16x8_t f1,
+                                     int16x8_t f2, int16x8_t i0, int16x8_t i1,
+                                     int16x8_t i2) {
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f0), vget_low_s16(i0));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f0), vget_high_s16(i0));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f1), vget_low_s16(i1));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f1), vget_high_s16(i1));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f2), vget_low_s16(i2));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f2), vget_high_s16(i2));
+  return accum;
+}
+
+// Performs multiply accumulate on 3 inputs of depth 8.
+inline Int32x8 MultiplyAccumulate3x3Filter(const Filter3x3x8& f, int16x8_t i0,
+                                           int16x8_t i1, int16x8_t i2,
+                                           int16x8_t i3, int16x8_t i4,
+                                           int16x8_t i5, int16x8_t i6,
+                                           int16x8_t i7, int16x8_t i8,
+                                           Int32x8 accum) {
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f0), vget_low_s16(i0));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f0), vget_high_s16(i0));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f1), vget_low_s16(i1));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f1), vget_high_s16(i1));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f2), vget_low_s16(i2));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f2), vget_high_s16(i2));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f3), vget_low_s16(i3));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f3), vget_high_s16(i3));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f4), vget_low_s16(i4));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f4), vget_high_s16(i4));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f5), vget_low_s16(i5));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f5), vget_high_s16(i5));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f6), vget_low_s16(i6));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f6), vget_high_s16(i6));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f7), vget_low_s16(i7));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f7), vget_high_s16(i7));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f8), vget_low_s16(i8));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f8), vget_high_s16(i8));
+  return accum;
+}
+
+inline void DotProductAndStore(const Filter3x3x8& filter, int16x8_t i0,
+                               int16x8_t i1, int16x8_t i2, int16x8_t i3,
+                               int16x8_t i4, int16x8_t i5, int16x8_t i6,
+                               int16x8_t i7, int16x8_t i8,
+                               const int32* bias_ptr, int32 output_offset,
+                               int32 output_multiplier, int output_shift,
+                               int32 output_activation_min,
+                               int32 output_activation_max, uint8* output_ptr) {
+  Int32x8 acc;
+  acc.low = vld1q_s32(bias_ptr);
+  acc.high = vld1q_s32(bias_ptr + 4);
+
+  acc = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7, i8,
+                                    acc);
+
+  DownquantizeAndStore(acc, output_offset, output_multiplier, output_shift,
+                       output_activation_min, output_activation_max,
+                       output_ptr);
+}
+
+// Performs multiply-accumulate on a 3x4 input for 2 horizontal outputs.
+inline void DotProductAndStore2xStride1(
+    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
+    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
+    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
+    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  Int32x8 acc_0, acc_1;
+  acc_0.low = vld1q_s32(bias_ptr);
+  acc_1.low = vld1q_s32(bias_ptr);
+  acc_0.high = vld1q_s32(bias_ptr + 4);
+  acc_1.high = vld1q_s32(bias_ptr + 4);
+
+  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i4, i5, i6, i8, i9,
+                                      i10, acc_0);
+  acc_1 = MultiplyAccumulate3x3Filter(filter, i1, i2, i3, i5, i6, i7, i9, i10,
+                                      i11, acc_1);
+  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
+                              output_shift, output_activation_min,
+                              output_activation_max, output_ptr,
+                              output_ptr_offset);
+}
+
+// Performs multiply-accumulate on a 4x3 input for 2 vertical outputs.
+inline void DotProductAndStore2yStride1(
+    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
+    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
+    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
+    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  Int32x8 acc_0, acc_1;
+  acc_0.low = vld1q_s32(bias_ptr);
+  acc_1.low = vld1q_s32(bias_ptr);
+  acc_0.high = vld1q_s32(bias_ptr + 4);
+  acc_1.high = vld1q_s32(bias_ptr + 4);
+
+  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7,
+                                      i8, acc_0);
+  acc_1 = MultiplyAccumulate3x3Filter(filter, i3, i4, i5, i6, i7, i8, i9, i10,
+                                      i11, acc_1);
+  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
+                              output_shift, output_activation_min,
+                              output_activation_max, output_ptr,
+                              output_ptr_offset);
+}
+
+// A kernel that is optimized on the number of output cells in the x and y
+// direction, and the stride. Assumes 3x3 filters of 8 depth.
+template <int kFixedOutputY, int kFixedOutputX, int kFixedStrideWidth,
+          int kFixedStrideHeight>
+struct ConvKernel3x3FilterDepth8 {};
+
+template <>
+struct ConvKernel3x3FilterDepth8<8, 8, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    // To process 8x8 outputs using a 3x3 filter, we require 10x10 inputs.
+    // Load inputs for the first 2 filters on the top left, then slide to
+    // the right, down, left, down, right, etc. in a snake-like path. This
+    // minimizes the total number of loads.
+    //
+    //        INPUT                          OUTPUT
+    //   |\----------------\               |\------------\
+    //   | \                \              | \            \
+    //   |  \----------------\             |  \------------\
+    //   |  | 0    ...     9 |             |  | 0  ...   7 |
+    //   |  | 10   ...    19 |     --->    |  | 8  ...  15 |
+    //   |  | 20   ...    29 |              \ | .. ...  .. |
+    //    \ | ..   ...    .. |               \| 56 ...  63 |
+    //     \| 90   ...   109 |                |------------|
+    //      |----------------|
+    //
+    // The first set of loads corresponds to:
+    //
+    //        INPUT                          OUTPUT
+    //   |\-----------------                |\-----------
+    //   | \                                | \
+    //   |  \-----------------              |  \----------
+    //   |  | 0  1   2  3 ...               |  | 0  1 ...
+    //   |  | 10 11 12 13 ...     --->      |  | ..   ...
+    //   |  | 20 21 22 23 ...                  | ..   ...
+    //   |  | ..   ...    ...
+    //
+    // The next set of loads correspond to a sliding window to the right.
+    // It loads inputs 4, 5, 14, 15, 23, 24 and keeps 2, 3, 12, 13, and 22:
+    //
+    //        INPUT                          OUTPUT
+    //   |\-------------------                |\-------------
+    //   | \                                  | \
+    //   |  \-------------------              |  \------------
+    //   |  | .. 2  3   4  5 ...              |  | .. 2  3 ...
+    //   |  | .. 12 13 14 15 ...     --->     |  | ..      ...
+    //   |  | .. 21 22 23 24 ...                 | ..      ...
+    //   |  | ..    ...      ...
+    //
+    // And so on...
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (0) and (1).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Slide to the right for outputs x = [2, 3], y = 0. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (2) and (3).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Slide to the right again for outputs x = [4, 5], y = 0. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (4) and (5).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_depth, output_depth);
+
+    // Slide to the right one last time for outputs x = [6, 7], y = 0.
+    // Referring to the indexes in the diagram above, this corresponds to
+    // outputs (6) and (7).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_depth, output_depth);
+
+    // Slide to down for outputs x = [6, 7], y = 1. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (14) and (15).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 1. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (12) and (13).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left again for outputs x = [2, 3], y = 1. Referring to the indexes
+    // in the diagram above, this corresponds to outputs (10) and (11).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 1. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (8) and (9).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (16) and (17).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (18) and (19).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (20) and (21).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (22) and (23).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (30) and (31).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (28) and (29).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (26) and (27).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 3. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (24) and (25).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 3 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (32) and (33).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (34) and (35).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (36) and (37).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 4. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (38) and (39).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (46) and (47).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (44) and (45).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (42) and (43).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 5. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (40) and (41).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 5 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (48) and (49).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 8 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (50) and (51).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (52) and (53).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 6. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (54) and (55).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (62) and (63).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 9 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (60) and (61).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (58) and (59).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 7. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (56) and (57).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 7 * output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 4, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    // To process 4x4 outputs using a 3x3 filter, we require 6x6 inputs.
+    // Load inputs for the first 2 filters on the top left, then slide to
+    // the right, down, left, down, right, etc. in a snake-like path. This
+    // minimizes the total number of loads.
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the top right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 3 * output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 2, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load next inputs one row down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load next row.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load last row.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 1, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 2x1 outputs starting from the top.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_row_size);
+
+    // Load inputs for bottom 2 rows.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_6, input_7, input_8, input_9, input_10, input_11, input_0,
+        input_1, input_2, input_3, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size,
+        output_row_size);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 2, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1, acc_2, acc_3;
+
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_2.low = vld1q_s32(bias_ptr);
+    acc_3.low = vld1q_s32(bias_ptr);
+
+    bias_ptr += 4;
+    acc_0.high = vld1q_s32(bias_ptr);
+    acc_1.high = vld1q_s32(bias_ptr);
+    acc_2.high = vld1q_s32(bias_ptr);
+    acc_3.high = vld1q_s32(bias_ptr);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    // Add scope for input registers to help the compiler know that it is
+    // not needed.
+    {
+      // To process 2x2 outputs using a 3x3 filter, we require 4x4 inputs.
+      // Load inputs for the top two filters first.
+      int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+          input_7, input_8, input_9, input_10, input_11;
+
+      const uint8* ptr = input_ptr;
+
+      // Load top 3 rows.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+        input_10 = vaddq_s16(input_10, input_offset_vec);
+        input_11 = vaddq_s16(input_11, input_offset_vec);
+      }
+
+      // Multiply-accum for top-left output.
+      acc_0 = MultiplyAccumulate3x3Filter(filter, input_0, input_1, input_2,
+                                          input_4, input_5, input_6, input_8,
+                                          input_9, input_10, acc_0);
+
+      // Multiply-accum for top-right output.
+      acc_1 = MultiplyAccumulate3x3Filter(filter, input_1, input_2, input_3,
+                                          input_5, input_6, input_7, input_9,
+                                          input_10, input_11, acc_1);
+
+      // Now load the bottom row.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+      }
+
+      // Multiply-accum for bottom-left output.
+      acc_2 = MultiplyAccumulate3x3Filter(filter, input_4, input_5, input_6,
+                                          input_8, input_9, input_10, input_0,
+                                          input_1, input_2, acc_2);
+
+      // Multiply-accum for bottom-right output.
+      acc_3 = MultiplyAccumulate3x3Filter(filter, input_5, input_6, input_7,
+                                          input_9, input_10, input_11, input_1,
+                                          input_2, input_3, acc_3);
+    }
+
+    DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
+                                  output_multiplier, output_shift,
+                                  output_activation_min, output_activation_max,
+                                  output_ptr, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 4, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the top right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 4, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_depth * 4;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 1, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    // To process 2x1 outputs using a 3x3 filter, we require 4x3 inputs.
+    // Load all inputs at the beginning.
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth * output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 2, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1;
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9;
+
+    const uint8* ptr = input_ptr;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+    // Load first 2 rows.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load last row.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 4, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    // Reuse 4x2 kernel twice.
+    ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
+        input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+        filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth,
+        output_width);
+
+    ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
+        input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
+        filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_ptr + 2 * output_depth, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 1, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
+        input_4, input_5, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Third output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_3, input_4, input_5, input_6, input_7, input_8, input_0,
+        input_1, input_2, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Fourth output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 2, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1, acc_2, acc_3;
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_2.low = vld1q_s32(bias_ptr);
+    acc_3.low = vld1q_s32(bias_ptr);
+
+    bias_ptr += 4;
+    acc_0.high = vld1q_s32(bias_ptr);
+    acc_1.high = vld1q_s32(bias_ptr);
+    acc_2.high = vld1q_s32(bias_ptr);
+    acc_3.high = vld1q_s32(bias_ptr);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    // Add scope for input registers to help the compiler know that it is
+    // not needed.
+    {
+      // To process 2x2 outputs using a 3x3 filter at stride 2, we require
+      // 5x5 inputs. We load the first 5x2 inputs at a time.
+      int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+          input_7, input_8, input_9;
+
+      const uint8* ptr = input_ptr;
+
+      // Load inputs.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+      }
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                    input_0, input_1, input_2);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                    input_2, input_3, input_4);
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                    input_5, input_6, input_7);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                    input_7, input_8, input_9);
+
+      // Load next inputs.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+      }
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                    input_0, input_1, input_2);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                    input_2, input_3, input_4);
+
+      // Moving onto the two bottom outputs.
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f0, filter.f1, filter.f2,
+                                    input_0, input_1, input_2);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f0, filter.f1, filter.f2,
+                                    input_2, input_3, input_4);
+
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f3, filter.f4, filter.f5,
+                                    input_5, input_6, input_7);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f3, filter.f4, filter.f5,
+                                    input_7, input_8, input_9);
+
+      // Load last input row.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+      }
+
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f6, filter.f7, filter.f8,
+                                    input_0, input_1, input_2);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f6, filter.f7, filter.f8,
+                                    input_2, input_3, input_4);
+    }
+
+    DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
+                                  output_multiplier, output_shift,
+                                  output_activation_min, output_activation_max,
+                                  output_ptr, output_depth, output_width);
+  }
 };
 
-struct Filter3x3x16 {
-  Int16x16x3 r0, r1, r2;
+template <>
+struct ConvKernel3x3FilterDepth8<2, 4, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    // Reuse 2x2 kernel twice.
+    ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run(
+        input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+        filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth,
+        output_width);
+
+    ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run(
+        input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
+        filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_ptr + 2 * output_depth, output_depth, output_width);
+  }
 };
 
-// Loads 3x3 filter of depth 16 and adds filter offsets.
-inline Filter3x3x16 LoadFilterDepth16(const uint8* filter_ptr,
-                                      int32 filter_offset, int output_depth) {
-  Filter3x3x16 filter;
+template <>
+struct ConvKernel3x3FilterDepth8<2, 1, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
 
-  uint8x8_t temp_u8_0, temp_u8_1, temp_u8_2, temp_u8_3, temp_u8_4, temp_u8_5,
-      temp_u8_6, temp_u8_7, temp_u8_8, temp_u8_9, temp_u8_10, temp_u8_11,
-      temp_u8_12, temp_u8_13, temp_u8_14, temp_u8_15, temp_u8_16, temp_u8_17;
-  int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
 
-  temp_u8_0 = vld1_u8(filter_ptr + 0 * output_depth);
-  temp_u8_1 = vld1_u8(filter_ptr + 0 * output_depth + 8);
-  temp_u8_2 = vld1_u8(filter_ptr + 1 * output_depth);
-  temp_u8_3 = vld1_u8(filter_ptr + 1 * output_depth + 8);
-  temp_u8_4 = vld1_u8(filter_ptr + 2 * output_depth);
-  temp_u8_5 = vld1_u8(filter_ptr + 2 * output_depth + 8);
-
-  temp_u8_6 = vld1_u8(filter_ptr + 3 * output_depth);
-  temp_u8_7 = vld1_u8(filter_ptr + 3 * output_depth + 8);
-  temp_u8_8 = vld1_u8(filter_ptr + 4 * output_depth);
-  temp_u8_9 = vld1_u8(filter_ptr + 4 * output_depth + 8);
-  temp_u8_10 = vld1_u8(filter_ptr + 5 * output_depth);
-  temp_u8_11 = vld1_u8(filter_ptr + 5 * output_depth + 8);
-
-  temp_u8_12 = vld1_u8(filter_ptr + 6 * output_depth);
-  temp_u8_13 = vld1_u8(filter_ptr + 6 * output_depth + 8);
-  temp_u8_14 = vld1_u8(filter_ptr + 7 * output_depth);
-  temp_u8_15 = vld1_u8(filter_ptr + 7 * output_depth + 8);
-  temp_u8_16 = vld1_u8(filter_ptr + 8 * output_depth);
-  temp_u8_17 = vld1_u8(filter_ptr + 8 * output_depth + 8);
-
-  filter.r0.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_0));
-  filter.r0.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_1));
-  filter.r0.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_2));
-  filter.r0.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_3));
-  filter.r0.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_4));
-  filter.r0.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_5));
-
-  filter.r1.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_6));
-  filter.r1.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_7));
-  filter.r1.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_8));
-  filter.r1.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_9));
-  filter.r1.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_10));
-  filter.r1.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_11));
-
-  filter.r2.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_12));
-  filter.r2.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_13));
-  filter.r2.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_14));
-  filter.r2.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_15));
-  filter.r2.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_16));
-  filter.r2.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_17));
-
-  filter.r0.v0.low = vaddq_s16(filter.r0.v0.low, filter_offset_vec);
-  filter.r0.v0.high = vaddq_s16(filter.r0.v0.high, filter_offset_vec);
-  filter.r0.v1.low = vaddq_s16(filter.r0.v1.low, filter_offset_vec);
-  filter.r0.v1.high = vaddq_s16(filter.r0.v1.high, filter_offset_vec);
-  filter.r0.v2.low = vaddq_s16(filter.r0.v2.low, filter_offset_vec);
-  filter.r0.v2.high = vaddq_s16(filter.r0.v2.high, filter_offset_vec);
-
-  filter.r1.v0.low = vaddq_s16(filter.r1.v0.low, filter_offset_vec);
-  filter.r1.v0.high = vaddq_s16(filter.r1.v0.high, filter_offset_vec);
-  filter.r1.v1.low = vaddq_s16(filter.r1.v1.low, filter_offset_vec);
-  filter.r1.v1.high = vaddq_s16(filter.r1.v1.high, filter_offset_vec);
-  filter.r1.v2.low = vaddq_s16(filter.r1.v2.low, filter_offset_vec);
-  filter.r1.v2.high = vaddq_s16(filter.r1.v2.high, filter_offset_vec);
-
-  filter.r2.v0.low = vaddq_s16(filter.r2.v0.low, filter_offset_vec);
-  filter.r2.v0.high = vaddq_s16(filter.r2.v0.high, filter_offset_vec);
-  filter.r2.v1.low = vaddq_s16(filter.r2.v1.low, filter_offset_vec);
-  filter.r2.v1.high = vaddq_s16(filter.r2.v1.high, filter_offset_vec);
-  filter.r2.v2.low = vaddq_s16(filter.r2.v2.low, filter_offset_vec);
-  filter.r2.v2.high = vaddq_s16(filter.r2.v2.high, filter_offset_vec);
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
 
-  return filter;
-}
+    const uint8* ptr = input_ptr;
 
-// Loads 3 input cells of depth 16 and adds input offsets.
-inline Int16x16x3 LoadInputRowDepth16(const uint8* ptr, int input_depth,
-                                      int32 input_offset,
-                                      Int16x16x3 input_row) {
-  uint8x8_t temp_0, temp_1;
-  int16x8_t offset_vec = vdupq_n_s16(input_offset);
-
-  temp_0 = vld1_u8(ptr + 0 * input_depth);
-  temp_1 = vld1_u8(ptr + 0 * input_depth + 8);
-  input_row.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v0.low = vaddq_s16(input_row.v0.low, offset_vec);
-  input_row.v0.high = vaddq_s16(input_row.v0.high, offset_vec);
-
-  temp_0 = vld1_u8(ptr + 1 * input_depth);
-  temp_1 = vld1_u8(ptr + 1 * input_depth + 8);
-  input_row.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v1.low = vaddq_s16(input_row.v1.low, offset_vec);
-  input_row.v1.high = vaddq_s16(input_row.v1.high, offset_vec);
-
-  temp_0 = vld1_u8(ptr + 2 * input_depth);
-  temp_1 = vld1_u8(ptr + 2 * input_depth + 8);
-  input_row.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v2.low = vaddq_s16(input_row.v2.low, offset_vec);
-  input_row.v2.high = vaddq_s16(input_row.v2.high, offset_vec);
-
-  return input_row;
-}
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
 
-// Performs multiply accumulate on 3 inputs of depth 16.
-inline Int32x16 MultiplyAccumulateRowDepth16(Int32x16 output,
-                                             const Int16x16x3& filter_row,
-                                             const Int16x16x3& input_row) {
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v0.low),
-                        vget_low_s16(input_row.v0.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v0.low),
-                        vget_high_s16(input_row.v0.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v0.high),
-                        vget_low_s16(input_row.v0.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v0.high),
-                        vget_high_s16(input_row.v0.high));
-
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v1.low),
-                        vget_low_s16(input_row.v1.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v1.low),
-                        vget_high_s16(input_row.v1.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v1.high),
-                        vget_low_s16(input_row.v1.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v1.high),
-                        vget_high_s16(input_row.v1.high));
-
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v2.low),
-                        vget_low_s16(input_row.v2.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v2.low),
-                        vget_high_s16(input_row.v2.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v2.high),
-                        vget_low_s16(input_row.v2.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v2.high),
-                        vget_high_s16(input_row.v2.high));
-
-  return output;
-}
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
 
-// Applies activation, offset and downquantize on a set of accumulator
-// registers of depth 16. Stores results to output.
-inline void DownquantizeAndStoreDepth16(Int32x16 acc, int32 output_multiplier,
-                                        int output_shift,
-                                        int32x4_t output_offset_vec,
-                                        int32x4_t output_activation_min_vec,
-                                        int32x4_t output_activation_max_vec,
-                                        uint8* output_ptr) {
-  // Fixed-point multiplication.
-  acc.v0 = vqrdmulhq_n_s32(acc.v0, output_multiplier);
-  acc.v1 = vqrdmulhq_n_s32(acc.v1, output_multiplier);
-  acc.v2 = vqrdmulhq_n_s32(acc.v2, output_multiplier);
-  acc.v3 = vqrdmulhq_n_s32(acc.v3, output_multiplier);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
 
-  using gemmlowp::RoundingDivideByPOT;
-  acc.v0 = RoundingDivideByPOT(acc.v0, output_shift);
-  acc.v1 = RoundingDivideByPOT(acc.v1, output_shift);
-  acc.v2 = RoundingDivideByPOT(acc.v2, output_shift);
-  acc.v3 = RoundingDivideByPOT(acc.v3, output_shift);
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
 
-  // Add the output offset.
-  acc.v0 = vaddq_s32(acc.v0, output_offset_vec);
-  acc.v1 = vaddq_s32(acc.v1, output_offset_vec);
-  acc.v2 = vaddq_s32(acc.v2, output_offset_vec);
-  acc.v3 = vaddq_s32(acc.v3, output_offset_vec);
+    // Second output.
+    output_ptr += output_row_size;
 
-  // Apply the activation function.
-  acc.v0 = vmaxq_s32(acc.v0, output_activation_min_vec);
-  acc.v1 = vmaxq_s32(acc.v1, output_activation_min_vec);
-  acc.v2 = vmaxq_s32(acc.v2, output_activation_min_vec);
-  acc.v3 = vmaxq_s32(acc.v3, output_activation_min_vec);
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
 
-  acc.v0 = vminq_s32(acc.v0, output_activation_max_vec);
-  acc.v1 = vminq_s32(acc.v1, output_activation_max_vec);
-  acc.v2 = vminq_s32(acc.v2, output_activation_max_vec);
-  acc.v3 = vminq_s32(acc.v3, output_activation_max_vec);
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
 
-  // Saturating cast to uint8 and store to destination.
-  int16x4_t acc_tlla_s16 = vqmovn_s32(acc.v0);
-  int16x4_t acc_tllb_s16 = vqmovn_s32(acc.v1);
-  int16x4_t acc_tlha_s16 = vqmovn_s32(acc.v2);
-  int16x4_t acc_tlhb_s16 = vqmovn_s32(acc.v3);
-
-  int16x8_t res_s16_0 = vcombine_s16(acc_tlla_s16, acc_tllb_s16);
-  int16x8_t res_s16_1 = vcombine_s16(acc_tlha_s16, acc_tlhb_s16);
-  uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
-  uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
-  vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
+        input_4, input_5, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 2, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 3 * input_depth;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
+        input_6, input_7, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 4, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 3 * input_depth;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
+        input_6, input_7, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Third output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 5 * input_depth;
+    temp_2 = vld1_u8(ptr);
+    temp_0 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_5 = vld1_u8(ptr);
+    temp_3 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_8 = vld1_u8(ptr);
+    temp_6 = vld1_u8(ptr + input_depth);
+
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_1, input_2, input_0, input_4, input_5, input_3, input_7,
+        input_8, input_6, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Fourth output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 7 * input_depth;
+    temp_1 = vld1_u8(ptr);
+    temp_2 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_4 = vld1_u8(ptr);
+    temp_5 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_7 = vld1_u8(ptr);
+    temp_8 = vld1_u8(ptr + input_depth);
+
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+
+    uint8x8_t temp_0 = vld1_u8(input_ptr);
+    uint8x8_t temp_1 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_2 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_ptr += input_row_size;
+    uint8x8_t temp_3 = vld1_u8(input_ptr);
+    uint8x8_t temp_4 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_5 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_ptr += input_row_size;
+    uint8x8_t temp_6 = vld1_u8(input_ptr);
+    uint8x8_t temp_7 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_8 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+inline void ShuffleInput(const uint8* input_ptr, int input_depth,
+                         int input_width, int input_height, int output_depth,
+                         int output_width, int output_height,
+                         uint8* output_ptr) {
+  const int input_row_size = input_depth * input_width;
+
+  for (int y = 0; y < output_height; y++) {
+    const uint8* ptr = input_ptr;
+    for (int x = 0; x < output_width; x++) {
+      memcpy(output_ptr, ptr, output_depth);
+      output_ptr += output_depth;
+      ptr += input_depth;
+    }
+    input_ptr += input_row_size;
+  }
 }
 
-// A kernel that is optimized on the number of output cells in the x and y
-// direction, and the stride. Assumes 3x3 filters of 16 depth.
-template <int kFixedOutputX, int kFixedOutputY, int kFixedStride = 1>
-struct ConvKernel3x3FilterDepth16 {};
+template <int kFixedHeight, int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvRow3x3FilterDepth8 {};
+
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth, kFixedStrideHeight> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 1x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<1, 4, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * kFixedStrideWidth * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // 1x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += kFixedStrideWidth * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <int kFixedStrideWidth, int kFixedStrideHeight>
+struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth, kFixedStrideHeight> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 2x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 4, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * kFixedStrideWidth * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // 2x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 2, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * kFixedStrideWidth * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 2x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 1, kFixedStrideWidth, kFixedStrideHeight>::
+            Run(input_ptr, input_depth, input_offset, input_row_size,
+                filter_ptr, filter_offset, bias_ptr, output_offset,
+                output_multiplier, output_shift, output_activation_min,
+                output_activation_max, output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += kFixedStrideWidth * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <>
+struct ConvRow3x3FilterDepth8<4, 1, 1> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 4x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 4, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // Handle the rest of the right side.
+    // 4x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 2, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 4x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 1, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += input_depth;
+      output_data += output_depth;
+    }
+  }
+};
 
 template <>
-struct ConvKernel3x3FilterDepth16<1, 2, 1> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    // 16 depth accumulators for the 2 outputs.
-    Int32x16 acc0, acc1;
-
-    // Accumulators for top filter.
-    acc0.v0 = vld1q_s32(bias_ptr);
-    acc0.v1 = vld1q_s32(bias_ptr + 4);
-    acc0.v2 = vld1q_s32(bias_ptr + 8);
-    acc0.v3 = vld1q_s32(bias_ptr + 12);
-    // Accumulators for bottom filter.
-    acc1.v0 = vld1q_s32(bias_ptr);
-    acc1.v1 = vld1q_s32(bias_ptr + 4);
-    acc1.v2 = vld1q_s32(bias_ptr + 8);
-    acc1.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row of top filter.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r0, input);
-
-      // Do second row of top filter.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r1, input);
-
-      // The inputs to second row of the top filter are also the inputs to the
-      // first row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r0, input);
-
-      // Do third row of top filter.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r2, input);
-
-      // The inputs to third row of the top filter are also the inputs to the
-      // second row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r1, input);
-
-      // Do third row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 3 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc0, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
-
-    DownquantizeAndStoreDepth16(acc1, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec,
-                                output_ptr + output_depth * output_width);
+struct ConvRow3x3FilterDepth8<4, 2, 2> {
+  // The buffer size of the shuffled input.
+  static inline constexpr int ShuffleWorkspaceSize() { return 64 * 9 * 9; }
+
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    // Branch and cache misses increase substantially with stride 2 kernels.
+    // Adding prefetching reduces latency by as much as 2x.
+    const int i0 = 0;
+    const int i1 = input_depth;
+    const int i2 = 2 * input_depth;
+    const int i3 = 3 * input_depth;
+    const int i4 = 4 * input_depth;
+    const int i5 = 5 * input_depth;
+    const int i6 = 6 * input_depth;
+    const int i7 = 7 * input_depth;
+    const int i8 = 8 * input_depth;
+
+#define DEPTHWISECONV_PRELOAD_ROW(input_ptr, i)         \
+  preload_l1_keep(input_ptr + i * input_row_size + i0); \
+  preload_l1_keep(input_ptr + i * input_row_size + i1); \
+  preload_l1_keep(input_ptr + i * input_row_size + i2); \
+  preload_l1_keep(input_ptr + i * input_row_size + i3); \
+  preload_l1_keep(input_ptr + i * input_row_size + i4); \
+  preload_l1_keep(input_ptr + i * input_row_size + i5); \
+  preload_l1_keep(input_ptr + i * input_row_size + i6); \
+  preload_l1_keep(input_ptr + i * input_row_size + i7); \
+  preload_l1_keep(input_ptr + i * input_row_size + i8);
+
+    int out_x = start_x;
+    // 4x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      int depth = 0;
+      for (; depth <= output_depth - 64; depth += 64) {
+        // Preload 9x9 input.
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
+
+        // For a large input window (64x9x9) that is small enough to fit in L1
+        // cache, copy the input into a separate buffer and run the kernel on
+        // this new buffer. This reduces the likelihood of cache misses when
+        // the kernel is loading input data. If this size is ever changed,
+        // update the ShuffleWorkspaceSize() function to return the new size.
+        ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 9,
+                     9, shuffle_workspace);
+        const uint8* shuffled_ptr = &shuffle_workspace[0];
+
+        for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
+          ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run(
+              shuffled_ptr, 64, input_offset, 64 * 9, filter_ptr, filter_offset,
+              bias_ptr, output_offset, output_multiplier, output_shift,
+              output_activation_min, output_activation_max, output_ptr,
+              output_depth, output_width);
+
+          shuffled_ptr += 8;
+          output_ptr += 8;
+          filter_ptr += 8;
+          bias_ptr += 8;
+        }
+        input_ptr += 64;
+      }
+
+      // Preload 9x9 input one more time for the rest of the depth.
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
+
+      for (; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * 2 * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+#undef DEPTHWISECONV_PRELOAD_ROW
+
+    // Handle the rest of the right side.
+    // 4x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * 2 * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 4x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 1, 2, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * input_depth;
+      output_data += output_depth;
+    }
   }
 };
 
 template <>
-struct ConvKernel3x3FilterDepth16<1, 2, 2> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    // 16 depth accumulators for the 2 outputs.
-    Int32x16 acc0, acc1;
-
-    // Accumulators for top filter.
-    acc0.v0 = vld1q_s32(bias_ptr);
-    acc0.v1 = vld1q_s32(bias_ptr + 4);
-    acc0.v2 = vld1q_s32(bias_ptr + 8);
-    acc0.v3 = vld1q_s32(bias_ptr + 12);
-    // Accumulators for bottom filter.
-    acc1.v0 = vld1q_s32(bias_ptr);
-    acc1.v1 = vld1q_s32(bias_ptr + 4);
-    acc1.v2 = vld1q_s32(bias_ptr + 8);
-    acc1.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row of top filter.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r0, input);
-
-      // Do second row of top filter.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r1, input);
-
-      // Do third row of top filter.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r2, input);
-
-      // The inputs to third row of the top filter are also the inputs
-      // to first row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r0, input);
-
-      // Do second row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 3 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r1, input);
-
-      // Do third row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 4 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc0, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
-
-    DownquantizeAndStoreDepth16(acc1, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec,
-                                output_ptr + output_depth * output_width);
+struct ConvRow3x3FilterDepth8<8, 2, 2> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    // Reuse 4 row kernels twice.
+    ConvRow3x3FilterDepth8<4, 2, 2>::Run(
+        input_data, start_x, start_y, input_depth, input_width, input_height,
+        input_row_size, input_offset, filter_data, filter_offset, bias_data,
+        output_offset, output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_data, output_depth, output_width,
+        shuffle_workspace);
+
+    ConvRow3x3FilterDepth8<4, 2, 2>::Run(
+        input_data + 2 * 4 * input_row_size, start_x, start_y + 4, input_depth,
+        input_width, input_height, input_row_size, input_offset, filter_data,
+        filter_offset, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_data + 4 * output_depth * output_width, output_depth,
+        output_width, shuffle_workspace);
   }
 };
 
 template <>
-struct ConvKernel3x3FilterDepth16<1, 1> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    Int32x16 acc;
-    acc.v0 = vld1q_s32(bias_ptr);
-    acc.v1 = vld1q_s32(bias_ptr + 4);
-    acc.v2 = vld1q_s32(bias_ptr + 8);
-    acc.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r0, input);
-
-      // Do second row.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r1, input);
-
-      // Do third row.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
+struct ConvRow3x3FilterDepth8<8, 1, 1> {
+  // The buffer size of the shuffled input.
+  static inline constexpr int ShuffleWorkspaceSize() { return 64 * 10 * 10; }
+
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+    // 8x8 at a time.
+    for (; out_x <= output_width - 8; out_x += 8) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      int depth = 0;
+      for (; depth <= output_depth - 64; depth += 64) {
+        // For a large input window (64x10x10) that is small enough to fit in L1
+        // cache, copy the input into a separate buffer and run the kernel on
+        // this new buffer. This reduces the likelihood of cache misses when
+        // the kernel is loading input data. If the size of the input window
+        // changes, update the function ShuffleWorkspaceSize() with the new
+        // size.
+        ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 10,
+                     10, shuffle_workspace);
+        const uint8* shuffled_ptr = shuffle_workspace;
+
+        for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
+          ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run(
+              shuffled_ptr, 64, input_offset, 64 * 10, filter_ptr,
+              filter_offset, bias_ptr, output_offset, output_multiplier,
+              output_shift, output_activation_min, output_activation_max,
+              output_ptr, output_depth, output_width);
+
+          shuffled_ptr += 8;
+          output_ptr += 8;
+          filter_ptr += 8;
+          bias_ptr += 8;
+        }
+        input_ptr += 64;
+      }
+
+      for (; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 8 * input_depth;
+      output_data += 8 * output_depth;
+    }
+
+    // Handle the rest of the right side by re-using 4 row kernels twice.
+    ConvRow3x3FilterDepth8<4, 1, 1>::Run(
+        input_data, out_x, start_y, input_depth, input_width, input_height,
+        input_row_size, input_offset, filter_data, filter_offset, bias_data,
+        output_offset, output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_data, output_depth, output_width,
+        shuffle_workspace);
+
+    ConvRow3x3FilterDepth8<4, 1, 1>::Run(
+        input_data + 4 * input_row_size, out_x, start_y + 4, input_depth,
+        input_width, input_height, input_row_size, input_offset, filter_data,
+        filter_offset, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_data + 4 * output_depth * output_width, output_depth,
+        output_width, shuffle_workspace);
   }
 };
 
-inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
-                                          const Dims<4>& filter_dims,
-                                          int stride_width, int stride_height,
-                                          int pad_width, int pad_height,
-                                          int depth_multiplier,
-                                          const Dims<4>& output_dims) {
+inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims,
+                                         const Dims<4>& filter_dims,
+                                         int stride_width, int stride_height,
+                                         int pad_width, int pad_height,
+                                         int depth_multiplier,
+                                         const Dims<4>& output_dims) {
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int input_depth = ArraySize(input_dims, 0);
@@ -458,14 +4426,15 @@ inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
                    depth_multiplier == 1 &&
                    (stride_width == 1 || stride_width == 2) &&
                    (stride_height == 1 || stride_height == 2) &&
-                   pad_width == 0 && pad_height == 0 && (input_depth % 16) == 0;
+                   (stride_width == stride_height) && pad_width == 0 &&
+                   pad_height == 0 && (input_depth % 8) == 0;
 
   if (!supported) {
     return false;
   }
 
-  // Handle case where padding is zero but type is not kValid. This would
-  // require special boundary case handling that is not supported yet.
+  // Handle case where padding is zero but padding type is not kValid.
+  // This would require special boundary case handling that is not supported.
 
   const int out_x = output_width - 1;
   const int out_y = output_height - 1;
@@ -481,7 +4450,7 @@ inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
   return in_x_end <= input_width && in_y_end <= input_height;
 }
 
-inline void DepthwiseConv3by3FilterDepth16(
+inline void DepthwiseConv3x3Filter(
     const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
     const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
     const int32* bias_data, const Dims<4>& bias_dims, int stride_width,
@@ -500,241 +4469,109 @@ inline void DepthwiseConv3by3FilterDepth16(
   const int output_width = ArraySize(output_dims, 1);
 
   // Algorithm assumes below constraints. It is optimized for depth multiplier
-  // of 1, 3x3 filter, no padding, strides 1 and 2.
+  // of 1, 3x3 filter, no padding and strides 1 and 2.
   TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
   TFLITE_DCHECK(depth_multiplier == 1);
   TFLITE_DCHECK(filter_height == 3);
   TFLITE_DCHECK(filter_width == 3);
   TFLITE_DCHECK(pad_height == 0);
   TFLITE_DCHECK(pad_width == 0);
-  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
   TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
+  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
+  TFLITE_DCHECK(stride_width == stride_height);
 
-  // The number of outputs to process in the main loop.
-  const int num_x_outputs = 1;
-  const int num_y_outputs = 2;
-
-  const int input_row_width = output_depth * (input_width + 2 * pad_width);
-  const int input_batch_size =
-      input_row_width * (input_height + 2 * pad_height);
+  const int input_row_size = input_depth * (input_width + 2 * pad_width);
+  const int output_row_size = output_depth * output_width;
+  const int input_batch_size = input_row_size * (input_height + 2 * pad_height);
   const int output_batch_size = output_depth * output_width * output_height;
-  const int input_ptr_x_increment = input_depth * stride_width;
 
-  // Calculate extents of non-boundary loop.
-  int out_x_start = 0;
-  for (; out_x_start < input_width; out_x_start++) {
-    int in_x = (out_x_start * stride_width) - pad_width;
-    if (in_x >= 0) {
-      break;
-    }
-  }
-  int out_x_end = output_width - 1;
-  for (; out_x_end >= 0; out_x_end--) {
-    int in_x = (out_x_end * stride_width) - pad_width;
-    int in_x_end = in_x + filter_width + (num_x_outputs - 1) * stride_width;
-    if (in_x_end <= input_width) {
-      out_x_end++;
-      break;
-    }
-  }
-  int out_y_start = 0;
-  for (; out_y_start < input_height; out_y_start++) {
-    int in_y = (out_y_start * stride_height) - pad_height;
-    if (in_y >= 0) {
-      break;
-    }
-  }
-  int out_y_end = output_height - 1;
-  for (; out_y_end >= 0; out_y_end--) {
-    int in_y = (out_y_end * stride_height) - pad_height;
-    int in_y_end = in_y + filter_height + (num_y_outputs - 1) * stride_height;
-    if (in_y_end <= input_height) {
-      out_y_end++;
-      break;
-    }
+  using conv_row_func_t = decltype(&ConvRow3x3FilterDepth8<1, 1, 1>::Run);
+  conv_row_func_t conv_1_output_row = ConvRow3x3FilterDepth8<1, 1, 1>::Run;
+  conv_row_func_t conv_2_output_rows = ConvRow3x3FilterDepth8<2, 1, 1>::Run;
+  conv_row_func_t conv_4_output_rows = ConvRow3x3FilterDepth8<4, 1, 1>::Run;
+  conv_row_func_t conv_8_output_rows = ConvRow3x3FilterDepth8<8, 1, 1>::Run;
+
+  if (stride_width == 2) {
+    conv_1_output_row = ConvRow3x3FilterDepth8<1, 2, 2>::Run;
+    conv_2_output_rows = ConvRow3x3FilterDepth8<2, 2, 2>::Run;
+    conv_4_output_rows = ConvRow3x3FilterDepth8<4, 2, 2>::Run;
+    conv_8_output_rows = ConvRow3x3FilterDepth8<8, 2, 2>::Run;
   }
 
-  using dot_product_func_t =
-      decltype(&ConvKernel3x3FilterDepth16<1, 2, 1>::Run);
-  dot_product_func_t dot_product_func = nullptr;
+  // Allocate maximum memory needed for shuffled input.
+  // TODO(mariewhite): The size of this workspace is small enough to be
+  // allocated on the stack. Eventually we will want to move it to the heap
+  // and have it allocated outside of this function, like the im2col_array used
+  // in gemmlowp.
+#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
+  uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
 
-  if (stride_width == 1 && stride_height == 1) {
-    dot_product_func = ConvKernel3x3FilterDepth16<1, 2, 1>::Run;
-  } else {
-    dot_product_func = ConvKernel3x3FilterDepth16<1, 2, 2>::Run;
-  }
+  // Make sure the kernels using this buffer will not run out of bounds.
+  static_assert(ConvRow3x3FilterDepth8<8, 1, 1>::ShuffleWorkspaceSize() <=
+                    DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
+                "Shuffle workspace size is too small.");
+  static_assert(ConvRow3x3FilterDepth8<4, 2, 2>::ShuffleWorkspaceSize() <=
+                    DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
+                "Shuffle workspace size is too small.");
 
-  // Offsets for preloading inputs.
-  const int i0 = 0;
-  const int i1 = input_depth;
-  const int i2 = 2 * input_depth;
-  const int i3 = input_row_width;
-  const int i4 = input_row_width + input_depth;
-  const int i5 = input_row_width + 2 * input_depth;
-  const int i6 = 2 * input_row_width;
-  const int i7 = 2 * input_row_width + input_depth;
-  const int i8 = 2 * input_row_width + 2 * input_depth;
-  const int i9 = 3 * input_row_width;
-  const int i10 = 3 * input_row_width + input_depth;
-  const int i11 = 3 * input_row_width + 2 * input_depth;
-  const int i12 = 4 * input_row_width;
-  const int i13 = 4 * input_row_width + input_depth;
-  const int i14 = 4 * input_row_width + 2 * input_depth;
+#undef DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE
 
   for (int b = 0; b < batches; ++b) {
-    const int32* bias_ptr = bias_data;
-    const uint8* filter_ptr = filter_data;
-
-    const int in_batch_offset = b * input_batch_size;
-    const int out_batch_offset = b * output_batch_size;
-
-    int depth = 0;
-    for (; depth <= output_depth - 16; depth += 16) {
-      Filter3x3x16 filter =
-          LoadFilterDepth16(filter_ptr, filter_offset, output_depth);
-
-      // Handle 1x2 outputs.
-      int out_y = out_y_start;
-      for (; out_y < out_y_end; out_y += num_y_outputs) {
-        int out_x = out_x_start;
-
-        int in_y_offset =
-            stride_height * input_row_width * (out_y + pad_height);
-        int in_x_offset = stride_width * input_depth * (out_x + pad_width);
-
-        const uint8* input_ptr =
-            input_data + depth + in_x_offset + in_y_offset + in_batch_offset;
-
-        // Preload inputs. If input depth is large, preload every value of the
-        // input for this depth range. Otherwise, preload only the first values
-        // of each row.
-        if (input_depth >= 32) {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i1);
-          preload_l1_keep(input_ptr + i2);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i4);
-          preload_l1_keep(input_ptr + i5);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i7);
-          preload_l1_keep(input_ptr + i8);
-          preload_l1_keep(input_ptr + i9);
-          preload_l1_keep(input_ptr + i10);
-          preload_l1_keep(input_ptr + i11);
-
-          if (stride_height == 2) {
-            preload_l1_keep(input_ptr + i12);
-            preload_l1_keep(input_ptr + i13);
-            preload_l1_keep(input_ptr + i14);
-          }
-        } else {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i9);
-
-          if (stride_height == 2) {
-            preload_l1_keep(input_ptr + i12);
-          }
-        }
+    const uint8* input_ptr = input_data + b * input_batch_size;
+    uint8* output_ptr = output_data + b * output_batch_size;
 
-        uint8* output_ptr = output_data + depth + (out_x * output_depth) +
-                            (output_depth * output_width * out_y) +
-                            out_batch_offset;
-
-        for (; out_x < out_x_end; out_x += num_x_outputs) {
-          dot_product_func(filter, input_ptr, input_depth, input_offset,
-                           input_row_width, bias_ptr, output_offset,
-                           output_multiplier, output_shift,
-                           output_activation_min, output_activation_max,
-                           output_ptr, output_depth, output_width);
-
-          input_ptr += input_ptr_x_increment * num_x_outputs;
-          output_ptr += output_depth * num_x_outputs;
-
-          // Preload the next inputs depending on stride.
-          if (stride_width == 1) {
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i8);
-            preload_l1_keep(input_ptr + i11);
-          } else if (stride_width == 2) {
-            preload_l1_keep(input_ptr + i1);
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i4);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i7);
-            preload_l1_keep(input_ptr + i8);
-            preload_l1_keep(input_ptr + i10);
-            preload_l1_keep(input_ptr + i11);
-            preload_l1_keep(input_ptr + i13);
-            preload_l1_keep(input_ptr + i14);
-          }
-        }
+    int out_y = 0;
 
-        // Handle the rest of the right side.
-        for (; out_x < output_width; out_x++) {
-          // This code path can only be reached if we're handling >1 x outputs
-          // at a time or support kSame padding.
-        }
-      }
+    // Handle 8 rows at a time.
+    for (; out_y <= output_height - 8; out_y += 8) {
+      conv_8_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
 
-      // Handle the rest of the bottom side.
-      for (; out_y < output_height; out_y++) {
-        int out_x = out_x_start;
-
-        int in_y_offset =
-            stride_height * input_row_width * (out_y + pad_height);
-        int in_x_offset = stride_width * input_depth * (out_x + pad_width);
-
-        const uint8* input_ptr =
-            input_data + depth + in_x_offset + in_y_offset + in_batch_offset;
-
-        if (input_depth >= 32) {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i1);
-          preload_l1_keep(input_ptr + i2);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i4);
-          preload_l1_keep(input_ptr + i5);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i7);
-        } else {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i6);
-        }
+      input_ptr += 8 * stride_height * input_row_size;
+      output_ptr += 8 * output_row_size;
+    }
 
-        uint8* output_ptr = output_data + depth + (out_x * output_depth) +
-                            (output_depth * output_width * out_y) +
-                            out_batch_offset;
+    // Handle 4 rows at a time.
+    for (; out_y <= output_height - 4; out_y += 4) {
+      conv_4_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
 
-        for (; out_x < output_width; out_x++) {
-          ConvKernel3x3FilterDepth16<1, 1>::Run(
-              filter, input_ptr, input_depth, input_offset, input_row_width,
-              bias_ptr, output_offset, output_multiplier, output_shift,
-              output_activation_min, output_activation_max, output_ptr,
-              output_depth, output_width);
+      input_ptr += 4 * stride_height * input_row_size;
+      output_ptr += 4 * output_row_size;
+    }
 
-          input_ptr += input_ptr_x_increment;
-          output_ptr += output_depth;
-
-          if (stride_width == 1) {
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i8);
-          } else if (stride_width == 2) {
-            preload_l1_keep(input_ptr + i1);
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i4);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i7);
-            preload_l1_keep(input_ptr + i8);
-          }
-        }
-      }
-      filter_ptr += 16;
-      bias_ptr += 16;
+    // Handle 2 rows at a time.
+    for (; out_y <= output_height - 2; out_y += 2) {
+      conv_2_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
+
+      input_ptr += 2 * stride_height * input_row_size;
+      output_ptr += 2 * output_row_size;
+    }
+
+    // Handle one row at a time.
+    for (; out_y < output_height; out_y++) {
+      conv_1_output_row(input_ptr, 0, out_y, input_depth, input_width,
+                        input_height, input_row_size, input_offset, filter_data,
+                        filter_offset, bias_data, output_offset,
+                        output_multiplier, output_shift, output_activation_min,
+                        output_activation_max, output_ptr, output_depth,
+                        output_width, shuffle_workspace);
+
+      input_ptr += stride_height * input_row_size;
+      output_ptr += output_row_size;
     }
   }
 }
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 9a274612ad65df0b48b0096d2471ab732f33541b..9ac48ebdc99abae910131fa17a0c5369a024c464 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
@@ -554,88 +555,261 @@ inline void GEMVForLstmCellWithSymmetricRange(
   // exercises it). We just guard our assumptions about size evenness with
   // the following assertions.
   TFLITE_DCHECK(!(output_size % 4));
-  TFLITE_DCHECK(!(input_size % 8));
+  TFLITE_DCHECK(!(input_size % 64));
   const int32* bias_ptr = bias_data;
   int16* output_ptr = output_data;
   const uint8x16_t signbit = vdupq_n_u8(0x80);
   for (int in = 0; in < input_size; in += 32) {
     optimized_ops_preload_l1_keep(input_data + in);
   }
+  const int left_shift = accum_shift > 0 ? accum_shift : 0;
+  const int right_shift = accum_shift > 0 ? 0 : -accum_shift;
   for (int out = 0; out < output_size; out += 4) {
-    const uint8* weights_ptr_0 = weights_data + out * input_size;
-    const uint8* weights_ptr_1 = weights_ptr_0 + 1 * input_size;
-    const uint8* weights_ptr_2 = weights_ptr_0 + 2 * input_size;
-    const uint8* weights_ptr_3 = weights_ptr_0 + 3 * input_size;
+    // Load the bias values
+    int32x4_t bias_vec = vld1q_s32(bias_ptr);
+    bias_ptr += 4;
 
-    int32x4_t acc_0 = vdupq_n_s32(0);
-    int32x4_t acc_1 = vdupq_n_s32(0);
-    int32x4_t acc_2 = vdupq_n_s32(0);
-    int32x4_t acc_3 = vdupq_n_s32(0);
-    int in = 0;
-    const int kReadAhead = 256;
-    // Handle 16 levels of depth at a time.
-    for (; in < input_size; in += 16) {
-      int8x16_t weights_val_0 =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_0)));
-      int8x16_t weights_val_1 =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_1)));
-      int8x16_t weights_val_2 =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_2)));
-      int8x16_t weights_val_3 =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(weights_ptr_3)));
-      int8x16_t input_val =
-          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + in)));
-      int16x8_t acc16_0 =
-          vmull_s8(vget_low_s8(weights_val_0), vget_low_s8(input_val));
-      int16x8_t acc16_1 =
-          vmull_s8(vget_low_s8(weights_val_1), vget_low_s8(input_val));
-      int16x8_t acc16_2 =
-          vmull_s8(vget_low_s8(weights_val_2), vget_low_s8(input_val));
-      int16x8_t acc16_3 =
-          vmull_s8(vget_low_s8(weights_val_3), vget_low_s8(input_val));
-      acc16_0 = vmlal_s8(acc16_0, vget_high_s8(weights_val_0),
-                         vget_high_s8(input_val));
-      acc16_1 = vmlal_s8(acc16_1, vget_high_s8(weights_val_1),
-                         vget_high_s8(input_val));
-      acc16_2 = vmlal_s8(acc16_2, vget_high_s8(weights_val_2),
-                         vget_high_s8(input_val));
-      acc16_3 = vmlal_s8(acc16_3, vget_high_s8(weights_val_3),
-                         vget_high_s8(input_val));
-      acc_0 = vpadalq_s16(acc_0, acc16_0);
-      acc_1 = vpadalq_s16(acc_1, acc16_1);
-      acc_2 = vpadalq_s16(acc_2, acc16_2);
-      acc_3 = vpadalq_s16(acc_3, acc16_3);
-      weights_ptr_0 += 16;
-      weights_ptr_1 += 16;
-      weights_ptr_2 += 16;
-      weights_ptr_3 += 16;
-      optimized_ops_preload_l1_stream(weights_ptr_0 + kReadAhead);
-      optimized_ops_preload_l1_stream(weights_ptr_1 + kReadAhead);
-      optimized_ops_preload_l1_stream(weights_ptr_2 + kReadAhead);
-      optimized_ops_preload_l1_stream(weights_ptr_3 + kReadAhead);
+    // Clear accumulators. We use 2 accumulator registers per row,
+    // for 4 rows. row_accumRN is the N-th accumulator for row R.
+    int32x4_t row_accum00 = vdupq_n_s32(0);
+    int32x4_t row_accum01 = vdupq_n_s32(0);
+    int32x4_t row_accum10 = vdupq_n_s32(0);
+    int32x4_t row_accum11 = vdupq_n_s32(0);
+    int32x4_t row_accum20 = vdupq_n_s32(0);
+    int32x4_t row_accum21 = vdupq_n_s32(0);
+    int32x4_t row_accum30 = vdupq_n_s32(0);
+    int32x4_t row_accum31 = vdupq_n_s32(0);
+
+    // kReadAhead parametrizes how far ahead we prefetch weights into L1 cache.
+    const int kReadAhead = 512;
+    // Prefetch the first weights values.
+    for (int k = 0; k < kReadAhead; k += 64) {
+      optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size +
+                                      k);
+    }
+    // Loop along the rows, handling 64 bytes per iteration because that's
+    // cache line size on most current ARM-architecture CPUs.
+    for (int in = 0; in < input_size; in += 64) {
+      // Prefetch some future weights values.
+      optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size +
+                                      in + kReadAhead);
+
+      // We will use 2 local 16-bit accumulators per row, for 2 rows.
+      // See below (*) for the rationale of processing only 2 rows at a time.
+      // local_accumRN is the N-th local accumulator for row R.
+      int16x8_t local_accum00;
+      int16x8_t local_accum01;
+      int16x8_t local_accum10;
+      int16x8_t local_accum11;
+
+      // Load 64 bytes of input activations values. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      int8x16_t input0 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 0)));
+      int8x16_t input1 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 1)));
+      int8x16_t input2 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 2)));
+      int8x16_t input3 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 3)));
+
+      // Beginning of the core accumulation. Notice how while we have 4
+      // rows to process, this code is taking care of only 2 rows at a time,
+      // thus being divided into two parts looking similar ("Rows 0 and 1" and
+      // "Rows 2 and 3").
+      //
+      // (*) The rationale for handling only 2 rows at a time is to avoid
+      // cache aliasing issues on 4-way set-associative L1-cache CPUs, such
+      // as Cortex-A53. With sufficiently large, power-of-two matrix dimensions,
+      // we may find ourselves in a situation where rows alias each other in
+      // the L1 cache, and moreover may also mutually alias with the input
+      // activations. If we try to load 4 rows at a time, together with the
+      // input activations, that may be 5 mutually-aliasing vectors, resulting
+      // in constant mutual eviction from L1 cache. Handling 2 rows at a time
+      // here largely mitigates these issues, and seems at least to be very
+      // effective on Cortex-A53:
+      //                          Before       After
+      // big (Cortex-A73)         2.85 ms      2.85 ms
+      // little (Cortex-A53)      11.0 ms      5.16 ms
+
+      // Rows 0 and 1:
+      // Load 64 bytes of weights values from each row. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      int8x16_t weights00 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 0)));
+      int8x16_t weights01 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 1)));
+      int8x16_t weights02 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 2)));
+      int8x16_t weights03 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 3)));
+      int8x16_t weights10 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 0)));
+      int8x16_t weights11 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 1)));
+      int8x16_t weights12 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 2)));
+      int8x16_t weights13 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 3)));
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0));
+      local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1));
+      local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0));
+      local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00),
+                               vget_high_s8(input0));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01),
+                               vget_high_s8(input1));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10),
+                               vget_high_s8(input0));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11),
+                               vget_high_s8(input1));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum00 = vpadalq_s16(row_accum00, local_accum00);
+      row_accum01 = vpadalq_s16(row_accum01, local_accum01);
+      row_accum10 = vpadalq_s16(row_accum10, local_accum10);
+      row_accum11 = vpadalq_s16(row_accum11, local_accum11);
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2));
+      local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3));
+      local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2));
+      local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02),
+                               vget_high_s8(input2));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03),
+                               vget_high_s8(input3));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12),
+                               vget_high_s8(input2));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13),
+                               vget_high_s8(input3));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum00 = vpadalq_s16(row_accum00, local_accum00);
+      row_accum01 = vpadalq_s16(row_accum01, local_accum01);
+      row_accum10 = vpadalq_s16(row_accum10, local_accum10);
+      row_accum11 = vpadalq_s16(row_accum11, local_accum11);
+
+      // Rows 2 and 3:
+      // Load 64 bytes of weights values from each row. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      weights00 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 0)));
+      weights01 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 1)));
+      weights02 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 2)));
+      weights03 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 3)));
+      weights10 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 0)));
+      weights11 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 1)));
+      weights12 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 2)));
+      weights13 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 3)));
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0));
+      local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1));
+      local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0));
+      local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00),
+                               vget_high_s8(input0));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01),
+                               vget_high_s8(input1));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10),
+                               vget_high_s8(input0));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11),
+                               vget_high_s8(input1));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum20 = vpadalq_s16(row_accum20, local_accum00);
+      row_accum21 = vpadalq_s16(row_accum21, local_accum01);
+      row_accum30 = vpadalq_s16(row_accum30, local_accum10);
+      row_accum31 = vpadalq_s16(row_accum31, local_accum11);
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2));
+      local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3));
+      local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2));
+      local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02),
+                               vget_high_s8(input2));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03),
+                               vget_high_s8(input3));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12),
+                               vget_high_s8(input2));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13),
+                               vget_high_s8(input3));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum20 = vpadalq_s16(row_accum20, local_accum00);
+      row_accum21 = vpadalq_s16(row_accum21, local_accum01);
+      row_accum30 = vpadalq_s16(row_accum30, local_accum10);
+      row_accum31 = vpadalq_s16(row_accum31, local_accum11);
     }
+
+    row_accum00 = vaddq_s32(row_accum00, row_accum01);
+    row_accum10 = vaddq_s32(row_accum10, row_accum11);
+    row_accum20 = vaddq_s32(row_accum20, row_accum21);
+    row_accum30 = vaddq_s32(row_accum30, row_accum31);
     // Horizontally reduce accumulators
     int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
         pairwise_reduced_acc_2, pairwise_reduced_acc_3;
     pairwise_reduced_acc_0 =
-        vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0));
+        vpadd_s32(vget_low_s32(row_accum00), vget_high_s32(row_accum00));
     pairwise_reduced_acc_1 =
-        vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1));
+        vpadd_s32(vget_low_s32(row_accum10), vget_high_s32(row_accum10));
     pairwise_reduced_acc_2 =
-        vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2));
+        vpadd_s32(vget_low_s32(row_accum20), vget_high_s32(row_accum20));
     pairwise_reduced_acc_3 =
-        vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3));
+        vpadd_s32(vget_low_s32(row_accum30), vget_high_s32(row_accum30));
     const int32x2_t reduced_lo =
         vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
     const int32x2_t reduced_hi =
         vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
     int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
     // Add bias values.
-    int32x4_t bias_vec = vld1q_s32(bias_ptr);
-    bias_ptr += 4;
     reduced = vaddq_s32(reduced, bias_vec);
-    int left_shift = accum_shift > 0 ? accum_shift : 0;
-    int right_shift = accum_shift > 0 ? 0 : -accum_shift;
     reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
     // Multiply by the fixed-point multiplier.
     reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
@@ -962,7 +1136,7 @@ inline void FullyConnected(
 #ifdef GEMMLOWP_NEON
   if (batches == 1 && input_offset == -128 && output_activation_min == -32768 &&
       output_activation_max == 32767) {
-    if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 16)) {
+    if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 64)) {
       GEMVForLstmCellWithSymmetricRange(input_data, input_dims, filter_data,
                                         filter_dims, bias_data_int32, bias_dims,
                                         output_multiplier, -output_shift,
@@ -1029,6 +1203,142 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                  output_activation_max, output_data, output_dims, gemm_context);
 }
 
+inline void ExperimentalShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label(
+      "ExperimentalShuffledFullyConnected/8bit");
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_EQ(output_activation_min, -32768);
+  TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  // The experimental shuffling is an optimization for matrix*vector product.
+  // We aren't interested in supporting non-matrix*vector-product cases, i.e.
+  // batches>1.
+  TFLITE_DCHECK_EQ(batches, 1);
+  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+  // so that just reinterpreting them as int8 values is equivalent to
+  // subtracting 128 from them, thus implementing for free the subtraction of
+  // the zero_point value 128.
+  const int8* shuffled_weights_ptr =
+      reinterpret_cast<const int8*>(shuffled_weights_data);
+#if defined USE_NEON
+  // We'll only need to xor signbit to the input activation values, as
+  // that xor-ing is pre-built into the shuffled weights values.
+  const uint8x16_t signbit = vdupq_n_u8(0x80);
+  const int right_shift = output_shift > 0 ? output_shift : 0;
+  const int left_shift = output_shift > 0 ? 0 : -output_shift;
+  for (int c = 0; c < output_depth; c += 4) {
+    // Accumulation loop.
+    int32x4_t row_accum0 = vdupq_n_s32(0);
+    int32x4_t row_accum1 = vdupq_n_s32(0);
+    int32x4_t row_accum2 = vdupq_n_s32(0);
+    int32x4_t row_accum3 = vdupq_n_s32(0);
+    for (int d = 0; d < accum_depth; d += 16) {
+      int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
+      int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
+      int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
+      int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
+      shuffled_weights_ptr += 64;
+      int8x16_t input =
+          vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + d)));
+      int16x8_t local_accum0 =
+          vmull_s8(vget_low_s8(weights0), vget_low_s8(input));
+      int16x8_t local_accum1 =
+          vmull_s8(vget_low_s8(weights1), vget_low_s8(input));
+      int16x8_t local_accum2 =
+          vmull_s8(vget_low_s8(weights2), vget_low_s8(input));
+      int16x8_t local_accum3 =
+          vmull_s8(vget_low_s8(weights3), vget_low_s8(input));
+      local_accum0 =
+          vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input));
+      local_accum1 =
+          vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input));
+      local_accum2 =
+          vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input));
+      local_accum3 =
+          vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input));
+      row_accum0 = vpadalq_s16(row_accum0, local_accum0);
+      row_accum1 = vpadalq_s16(row_accum1, local_accum1);
+      row_accum2 = vpadalq_s16(row_accum2, local_accum2);
+      row_accum3 = vpadalq_s16(row_accum3, local_accum3);
+    }
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+    pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0));
+    pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1));
+    pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2));
+    pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3));
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    int32x4_t bias_vec = vld1q_s32(bias_data + c);
+    reduced = vaddq_s32(reduced, bias_vec);
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
+    // Multiply by the fixed-point multiplier.
+    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+    // Rounding-shift-right.
+    using gemmlowp::RoundingDivideByPOT;
+    reduced = RoundingDivideByPOT(reduced, right_shift);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    vst1_s16(output_data + c, res16);
+  }
+#else
+  for (int c = 0; c < output_depth; c += 4) {
+    // Internal accumulation.
+    // Initialize accumulator with the bias-value.
+    int32 accum[4] = {0};
+    // Accumulation loop.
+    for (int d = 0; d < accum_depth; d += 16) {
+      for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 16; j++) {
+          int8 input_val = input_data[d + j] - 128;
+          int8 weights_val = *shuffled_weights_ptr++;
+          accum[i] += weights_val * input_val;
+        }
+      }
+    }
+    for (int i = 0; i < 4; i++) {
+      // Add bias value
+      int acc = accum[i] + bias_data[c + i];
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      acc =
+          MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift);
+      // Saturate, cast to int16, and store to output array.
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[c + i] = acc;
+    }
+  }
+#endif
+}
+
 template <typename T>
 inline void ExtractPatchIntoBufferColumn(
     const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
@@ -3764,7 +4074,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-  gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
+gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -3962,6 +4272,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
 // optimized yet.
 inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
                        float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -3995,6 +4306,94 @@ inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Currently just a copy of the reference code.
+inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax/Uint8");
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    // TODO(b/77858996): Implement fixed-point log().
+    // Not a fully-quantized implementation: floating-point log().
+    const float float_log_sum_of_exps =
+        std::log(static_cast<float>(sum_of_exps.raw()) /
+                 (1 << (31 - kAccumulationIntegerBits)));
+    const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound(
+        float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits))));
+
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
+    const int adjusted_diff_min =
+        std::max(diff_min - 1,  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     reverse_scaling_right_shift));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32 unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
+      }
+    }
+  }
+}
+
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Logistic");
@@ -4008,7 +4407,7 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_zero_point, int32 input_range_radius,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Logistic");
+  gemmlowp::ScopedProfilingLabel label("Logistic/Uint8");
   /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
   /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
   /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -4488,66 +4887,23 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
 }
 
 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, float* output_data,
+                      float rmin, float rmax, int num_bits, float* output_data,
                       const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("FakeQuant");
 
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
-  TFLITE_DCHECK_LE(rmin, 0.);
-  TFLITE_DCHECK_GE(rmax, 0.);
-
-  // Determine quantization parameters: zero_point, scale.
-  using Integer = uint8;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const float qmin_float = qmin;
-  const float qmax_float = qmax;
-  int32 zero_point = 0;
-  float scale = 0.f;
-  // If rmin==rmax, both must be zero per the above assertion,
-  // so we are done.
-  if (rmin != rmax) {
-    // First determine the scale.
-    scale = (rmax - rmin) / (qmax_float - qmin_float);
-
-    // Zero-point computation.
-    // First the initial floating-point computation. The zero-point can be
-    // determined from solving an affine equation for any known pair
-    // (real value, corresponding quantized value).
-    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-    // The arithmetic error on the zero point computed from either pair
-    // will be roughly machine_epsilon * (sum of absolute values of terms)
-    // so we want to use the variant that adds the smaller terms.
-    const float zero_point_from_min = qmin_float - rmin / scale;
-    const float zero_point_from_max = qmax_float - rmax / scale;
-    const float zero_point_from_min_error =
-        std::abs(qmin_float) + std::abs(rmin / scale);
-    const float zero_point_from_max_error =
-        std::abs(qmax_float) + std::abs(rmax / scale);
-
-    const float zero_point_float =
-        zero_point_from_min_error < zero_point_from_max_error
-            ? zero_point_from_min
-            : zero_point_from_max;
-
-    // Now we need to nudge the zero point to be an integer
-    // (our zero points are integer, and this is motivated by the requirement
-    // to be able to represent the real value "0" exactly as a quantized value,
-    // which is required in multiple places, for example in Im2col with SAME
-    // padding).
-    if (zero_point_float < qmin_float) {
-      zero_point = qmin;
-    } else if (zero_point_float > qmax_float) {
-      zero_point = qmax;
-    } else {
-      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
-    }
-    // The zero point should always be in the range of quantized value,
-    // [qmin, qmax].
-    TFLITE_DCHECK_GE(zero_point, qmin);
-    TFLITE_DCHECK_LE(zero_point, qmax);
-  }
+  TFLITE_DCHECK_LE(rmin, 0.0f);
+  TFLITE_DCHECK_GE(rmax, 0.0f);
+  TFLITE_DCHECK_LT(rmin, rmax);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  int quant_min = 0;
+  int quant_max = (1 << num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const float inv_nudged_scale = 1.0f / nudged_scale;
 
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
@@ -4558,11 +4914,12 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
       for (int x = 0; x < width; ++x) {
         for (int c = 0; c < depth; ++c) {
           const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float unclamped_quantized_val =
-              TfLiteRound(zero_point + src_val / scale);
-          const float quantized_val = std::min(
-              qmax_float, std::max(qmin_float, unclamped_quantized_val));
-          const float dst_val = scale * (quantized_val - zero_point);
+          const float clamped =
+              std::min(nudged_max, std::max(nudged_min, src_val));
+          const float clamped_shifted = clamped - nudged_min;
+          const float dst_val =
+              TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+              nudged_min;
           output_data[Offset(output_dims, c, x, y, b)] = dst_val;
         }
       }
@@ -4991,6 +5348,7 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* paddings_data,
                            const Dims<4>& paddings_dims, T* output_data,
                            const Dims<4>& output_dims) {
+  // Unoptimized - Straight copy from reference ops.
   gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");
 
   const int output_batch_size = ArraySize(output_dims, 3);
@@ -5032,29 +5390,76 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Helper methods for BatchToSpaceND.
+// `spatial_index_dim` specifies post-crop offset index in this spatial
+// dimension, i.e. spatial offset introduced by flattening batch to spatial
+// dimension minus the crop size at beginning. `block_shape_dim` is the block
+// size in current dimension. `input_dim` and `output_dim` are input and output
+// size of BatchToSpaceND operation in current dimension.
+// Output start index is inclusive and end index is exclusive.
+inline void GetIndexRange(int spatial_index_dim, int block_shape_dim,
+                          int input_dim, int output_dim, int* start_index,
+                          int* end_index) {
+  // (*start_index) * block_shape_dim is effectively rounded up to the next
+  // multiple of block_shape_dim by the integer division.
+  *start_index =
+      std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+  // Similarly, (*end_index) * block_shape_dim is rounded up too (note that
+  // end_index is exclusive).
+  *end_index = std::min(
+      input_dim,
+      (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+}
+
 template <typename T>
 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
-                           const Dims<4>& block_shape_dims, T* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& block_shape_dims,
+                           const int32* crops_data, const Dims<4>& crops_dims,
+                           T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
 
   const int output_batch_size = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
   const int input_batch_size = ArraySize(input_dims, 3);
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int depth = ArraySize(input_dims, 0);
   const int block_shape_width = block_shape_data[1];
   const int block_shape_height = block_shape_data[0];
+  const int crops_top = crops_data[0];
+  const int crops_left = crops_data[2];
 
   for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
-    for (int in_h = 0; in_h < input_height; ++in_h) {
-      for (int in_w = 0; in_w < input_width; ++in_w) {
-        int out_batch = in_batch % output_batch_size;
-        int out_w = in_w * block_shape_width +
-                    (in_batch / output_batch_size) % block_shape_width;
-        int out_h = in_h * block_shape_height +
-                    (in_batch / output_batch_size) / block_shape_width;
+    const int out_batch = in_batch % output_batch_size;
+    const int spatial_offset = in_batch / output_batch_size;
+
+    int in_h_start = 0;
+    int in_h_end = 0;
+    // GetIndexRange ensures start and end indices are in [0, output_height).
+    GetIndexRange(spatial_offset / block_shape_width - crops_top,
+                  block_shape_height, input_height, output_height, &in_h_start,
+                  &in_h_end);
+
+    for (int in_h = in_h_start; in_h < in_h_end; ++in_h) {
+      const int out_h = in_h * block_shape_height +
+                        spatial_offset / block_shape_width - crops_top;
+      TFLITE_DCHECK_GE(out_h, 0);
+      TFLITE_DCHECK_LT(out_h, output_height);
+
+      int in_w_start = 0;
+      int in_w_end = 0;
+      // GetIndexRange ensures start and end indices are in [0, output_width).
+      GetIndexRange(spatial_offset % block_shape_width - crops_left,
+                    block_shape_width, input_width, output_width, &in_w_start,
+                    &in_w_end);
+
+      for (int in_w = in_w_start; in_w < in_w_end; ++in_w) {
+        const int out_w = in_w * block_shape_width +
+                          spatial_offset % block_shape_width - crops_left;
+        TFLITE_DCHECK_GE(out_w, 0);
+        TFLITE_DCHECK_LT(out_w, output_width);
         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
         const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
         memcpy(out, in, depth * sizeof(T));
@@ -5067,7 +5472,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   gemmlowp::ScopedProfilingLabel label("Pad");
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
@@ -5087,27 +5492,27 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   const int input_depth = ArraySize(input_dims, 0);
 
   if (left_b_padding != 0) {
-    memset(output_data, 0,
+    memset(output_data, pad_value,
            left_b_padding * output_height * output_width * output_depth *
                sizeof(T));
   }
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
+      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), pad_value,
              left_h_padding * output_width * output_depth * sizeof(T));
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
+        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), pad_value,
                left_w_padding * output_depth * sizeof(T));
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
-          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
-                 left_d_padding * sizeof(T));
+          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+                 pad_value, left_d_padding * sizeof(T));
         }
 
         T* out = output_data +
@@ -5121,20 +5526,21 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
           memset(
               output_data + Offset(output_dims, output_depth - right_d_padding,
                                    out_w, out_h, out_b),
-              0, right_d_padding * sizeof(T));
+              pad_value, right_d_padding * sizeof(T));
         }
       }
       if (right_w_padding != 0) {
         memset(
             output_data + Offset(output_dims, 0, output_width - right_w_padding,
                                  out_h, out_b),
-            0, right_w_padding * output_depth * sizeof(T));
+            pad_value, right_w_padding * output_depth * sizeof(T));
       }
     }
     if (right_h_padding != 0) {
       memset(output_data + Offset(output_dims, 0, 0,
                                   output_height - right_h_padding, out_b),
-             0, right_h_padding * output_width * output_depth * sizeof(T));
+             pad_value,
+             right_h_padding * output_width * output_depth * sizeof(T));
     }
   }
   if (right_b_padding != 0) {
@@ -5146,6 +5552,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
index 18be6777a5caeb45a4ffabd8b7f1793de7b053f8..b0951aac8cbb98a181d9dcaef88770fadfc74f62 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -78,6 +78,22 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
                                    quantized_multiplier, left_shift);
 }
 
+void PreprocessLogSoftmaxScaling(double beta, double input_scale,
+                                 int input_integer_bits,
+                                 int32_t* quantized_multiplier, int* left_shift,
+                                 int32_t* reverse_scaling_divisor,
+                                 int* reverse_scaling_right_shift) {
+  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
+                           quantized_multiplier, left_shift);
+
+  // Also calculate what amounts to the inverse scaling factor for the input.
+  const double real_reverse_scaling_divisor =
+      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
+  tflite::QuantizeMultiplierSmallerThanOne(real_reverse_scaling_divisor,
+                                           reverse_scaling_divisor,
+                                           reverse_scaling_right_shift);
+}
+
 int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
                                     (1ll << (31 - input_integer_bits)) /
@@ -88,4 +104,25 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   return static_cast<int>(std::floor(max_input_rescaled));
 }
 
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max,
+                            float* scale) {
+  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
+  const float quant_min_float = static_cast<float>(quant_min);
+  const float quant_max_float = static_cast<float>(quant_max);
+  *scale = (max - min) / (quant_max_float - quant_min_float);
+  const float zero_point_from_min = quant_min_float - min / *scale;
+  uint16 nudged_zero_point;
+  if (zero_point_from_min < quant_min_float) {
+    nudged_zero_point = static_cast<uint16>(quant_min);
+  } else if (zero_point_from_min > quant_max_float) {
+    nudged_zero_point = static_cast<uint16>(quant_max);
+  } else {
+    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
+  }
+  *nudged_min = (quant_min_float - nudged_zero_point) * (*scale);
+  *nudged_max = (quant_max_float - nudged_zero_point) * (*scale);
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index 9a04b76e56b2527b06f5b0ec1e75e991fd1cbdea..4a217515f142b2451ebd61e423871b95cdc09748 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -196,7 +196,12 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
 void PreprocessSoftmaxScaling(double beta, double input_scale,
                               int input_integer_bits,
                               int32_t* quantized_multiplier, int* left_shift);
-
+// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
+void PreprocessLogSoftmaxScaling(double beta, double input_scale,
+                                 int input_integer_bits,
+                                 int32_t* quantized_multiplier, int* left_shift,
+                                 int32_t* reverse_scaling_divisor,
+                                 int* reverse_scaling_right_shift);
 // Calculate the largest input that will result in a within-bounds intermediate
 // result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
 // it must not overflow before we reduce the value by multiplication by the
@@ -204,6 +209,14 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
 // Softmax.
 int CalculateInputRadius(int input_integer_bits, int input_left_shift);
 
+// Nudges a min/max quantization range to ensure zero is zero.
+// Gymnastics with nudged zero point is to ensure that real zero maps to
+// an integer, which is required for e.g. zero-padding in convolutional layers.
+// Outputs nudged_min, nudged_max, nudged_scale.
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max, float* scale);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 31e190e2480f6af389a65b66a5c65374e1e42ee1..b11489e4af0d4658ee2c44da55e379deee1fbb6e 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
@@ -601,6 +602,67 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline void ExperimentalShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    gemmlowp::GemmContext* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  // The experimental shuffling is an optimization for matrix*vector product.
+  // We aren't interested in supporting non-matrix*vector-product cases, i.e.
+  // batches>1.
+  TFLITE_DCHECK_EQ(batches, 1);
+  // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+  // so that just reinterpreting them as int8 values is equivalent to
+  // subtracting 128 from them, thus implementing for free the subtraction of
+  // the zero_point value 128.
+  const int8* shuffled_weights_ptr =
+      reinterpret_cast<const int8*>(shuffled_weights_data);
+  for (int c = 0; c < output_depth; c += 4) {
+    // Internal accumulation.
+    // Initialize accumulator with the bias-value.
+    int32 accum[4] = {0};
+    // Accumulation loop.
+    for (int d = 0; d < accum_depth; d += 16) {
+      for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 16; j++) {
+          int8 input_val = input_data[d + j] - 128;
+          int8 weights_val = *shuffled_weights_ptr++;
+          accum[i] += weights_val * input_val;
+        }
+      }
+    }
+    for (int i = 0; i < 4; i++) {
+      // Add bias value
+      int acc = accum[i] + bias_data[c + i];
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      acc =
+          MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift);
+      // Saturate, cast to int16, and store to output array.
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[c + i] = acc;
+    }
+  }
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
@@ -635,27 +697,14 @@ void NonGlobalBatchNormalization(
     const Dims<4>& offset_dims, float* output_data,
     const Dims<4>& output_dims) {
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
-                        offset_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
-                        offset_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
-                        offset_dims, 0, output_dims, 0);
+  const int inner_size = MatchingFlatSizeSkipDim(
+      input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims);
 
   for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, x, y, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
-              offset_data[Offset(offset_dims, c, x, y, 0)]);
-        }
-      }
+    for (int i = 0; i < inner_size; ++i) {
+      output_data[b * inner_size + i] = ActivationFunction<Ac>(
+          (input_data[b * inner_size + i] - mean_data[i]) * multiplier_data[i] +
+          offset_data[i]);
     }
   }
 }
@@ -669,87 +718,52 @@ void GlobalBatchNormalization(const float* input_data,
                               const float* offset_data,
                               const Dims<4>& offset_dims, float* output_data,
                               const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth =
       MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                         offset_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
-              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = ActivationFunction<Ac>(
+          (input_data[depth * i + c] - mean_data[c]) * multiplier_data[c] +
+          offset_data[c]);
     }
   }
 }
 
 inline void Relu(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float lower = 0;
-          float clamped = val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float lower = 0;
+    const float clamped = val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 1;
-          const float lower = -1;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 1;
+    const float lower = -1;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 6;
-          const float lower = 0;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 6;
+    const float lower = 0;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
@@ -757,24 +771,17 @@ template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   static_assert(Ac == FusedActivationFunctionType::kNone, "");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        float squared_l2_norm = 0;
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          squared_l2_norm += val * val;
-        }
-        float l2_norm = std::sqrt(squared_l2_norm);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    float squared_l2_norm = 0;
+    for (int c = 0; c < depth; ++c) {
+      const float val = input_data[depth * i + c];
+      squared_l2_norm += val * val;
+    }
+    const float l2_norm = std::sqrt(squared_l2_norm);
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
     }
   }
 }
@@ -859,26 +866,11 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] +
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] + input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1141,26 +1133,11 @@ inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] *
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] * input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1339,6 +1316,33 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                output_data, output_dims);
 }
 
+inline void Div(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] /
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
@@ -1380,57 +1384,15 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-inline void Div(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] /
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
-  }
-}
-
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] -
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1812,15 +1774,9 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
   (void)gemm_context;  // only used in optimized code.
 
   // Gather dimensions information, and perform consistency checks.
-  const int batches =
-      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
-                        output_state_dims, 3, output_activ_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
-                        output_state_dims, 2, output_activ_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
-                        output_state_dims, 1, output_activ_dims, 1);
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_dims, 0, prev_activ_dims, prev_state_dims,
+                              output_state_dims, output_activ_dims);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
   const int input_depth = ArraySize(input_dims, 0);
@@ -1836,9 +1792,7 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
       MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                         output_state_dims, 0, output_activ_dims, 0);
   TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
-  const int fc_batches = ArraySize(activ_temp_dims, 1) *
-                         ArraySize(activ_temp_dims, 2) *
-                         ArraySize(activ_temp_dims, 3);
+  const int fc_batches = FlatSizeSkipDim(activ_temp_dims, 0);
   const int fc_output_depth =
       MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
   const int fc_accum_depth = ArraySize(weights_dims, 0);
@@ -1883,7 +1837,6 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
 
   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
   // and muls, all done in 16-bit fixed-point.
-  const int outer_size = batches * width * height;
   for (int b = 0; b < outer_size; ++b) {
     for (int c = 0; c < output_depth; ++c) {
       // Define the fixed-point data types that we will use here. All use
@@ -2418,28 +2371,20 @@ inline void LocalResponseNormalization(const float* input_data,
                                        float bias, float alpha, float beta,
                                        float* output_data,
                                        const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const int begin_input_c = std::max(0, c - range);
-          const int end_input_c = std::min(depth, c + range);
-          float accum = 0.f;
-          for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
-            const float input_val =
-                input_data[Offset(input_dims, input_c, x, y, b)];
-            accum += input_val * input_val;
-          }
-          const float multiplier = std::pow(bias + alpha * accum, -beta);
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] * multiplier;
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      const int begin_input_c = std::max(0, c - range);
+      const int end_input_c = std::min(depth, c + range);
+      float accum = 0.f;
+      for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
+        const float input_val = input_data[i * depth + input_c];
+        accum += input_val * input_val;
       }
+      const float multiplier = std::pow(bias + alpha * accum, -beta);
+      output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
     }
   }
 }
@@ -2447,37 +2392,28 @@ inline void LocalResponseNormalization(const float* input_data,
 inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                     float beta, float* output_data,
                     const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        // Find max element value which we'll use to ensure numerical stability
-        // taking advantage of the following equality:
-        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
-        float max = std::numeric_limits<float>::lowest();
-        for (int c = 0; c < depth; ++c) {
-          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
 
-        // Compute sum.
-        float sum = 0.f;
-        for (int c = 0; c < depth; ++c) {
-          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
-                          beta);
-        }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp((input_data[i * depth + c] - max) * beta);
+    }
 
-        // Compute result.
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
-                       beta) /
-              sum;
-        }
-      }
+    // Compute result.
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] =
+          std::exp((input_data[i * depth + c] - max) * beta) / sum;
     }
   }
 }
@@ -2498,73 +2434,63 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int x = 0; x < width; ++x) {
-      for (int y = 0; y < height; ++y) {
-        uint8 max_in_row = 0;
-        for (int c = 0; c < depth; ++c) {
-          max_in_row =
-              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
-        }
-
-        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
-        for (int c = 0; c < depth; ++c) {
-          int32 input_diff =
-              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
-              max_in_row;
-          if (input_diff >= diff_min) {
-            const int32 input_diff_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_diff, input_beta_multiplier, input_beta_left_shift);
-            const FixedPointScaledDiff scaled_diff_f8 =
-                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-            sum_of_exps =
-                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
-                                  exp_on_negative_values(scaled_diff_f8));
-          }
-        }
-
-        int32 fixed_sum_of_exps = sum_of_exps.raw();
-        int headroom_plus_one =
-            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
-        // This is the number of bits to the left of the binary point above 1.0.
-        // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
-        // no later adjustment will be needed.
-        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
-        int32 shifted_sum_minus_one = static_cast<int32>(
-            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
-            (static_cast<uint32>(1) << 31));
-
-        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
-            FixedPoint0::FromRaw(shifted_sum_minus_one));
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int32 fixed_sum_of_exps = sum_of_exps.raw();
+    int headroom_plus_one =
+        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
+    // This is the number of bits to the left of the binary point above 1.0.
+    // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
+    // no later adjustment will be needed.
+    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+    int32 shifted_sum_minus_one = static_cast<int32>(
+        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+        (static_cast<uint32>(1) << 31));
+
+    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+        FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+        int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
 
-        for (int c = 0; c < depth; ++c) {
-          int32 input_diff =
-              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
-              max_in_row;
-          if (input_diff >= diff_min) {
-            const int32 input_diff_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_diff, input_beta_multiplier, input_beta_left_shift);
-            const FixedPointScaledDiff scaled_diff_f8 =
-                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
-            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
-            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
-                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
-
-            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
-                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
-
-          } else {
-            output_data[Offset(output_dims, c, x, y, b)] = 0;
-          }
-        }
+      } else {
+        output_data[i * depth + c] = 0;
       }
     }
   }
@@ -2572,109 +2498,170 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
 
 inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
                        float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        // Find max element value which we'll use to ensure numerical stability
-        // taking advantage of the following equality:
-        // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
-        float max = std::numeric_limits<float>::lowest();
-        for (int c = 0; c < depth; ++c) {
-          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
 
-        // Compute sum.
-        float sum = 0.f;
-        for (int c = 0; c < depth; ++c) {
-          sum += std::exp(input_data[Offset(input_dims, c, x, y, b)] - max);
-        }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp(input_data[i * depth + c] - max);
+    }
 
-        // Compute result.
-        const float log_sum = std::log(sum);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] - max - log_sum;
-        }
-      }
+    // Compute result.
+    const float log_sum = std::log(sum);
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
     }
   }
 }
 
-inline void Logistic(const float* input_data, const Dims<4>& input_dims,
-                     float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const Dims<4>& output_dims) {
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = 1.f / (1.f + std::exp(-val));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    // TODO(b/77858996): Implement fixed-point log().
+    // Not a fully-quantized implementation: floating-point log().
+    const float float_log_sum_of_exps =
+        std::log(static_cast<float>(sum_of_exps.raw()) /
+                 (1 << (31 - kAccumulationIntegerBits)));
+    const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound(
+        float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits))));
+
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
+    const int adjusted_diff_min =
+        std::max(diff_min - 1,  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     reverse_scaling_right_shift));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32 unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
       }
     }
   }
 }
 
+inline void Logistic(const float* input_data, const Dims<4>& input_dims,
+                     float* output_data, const Dims<4>& output_dims) {
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = 1.f / (1.f + std::exp(-val));
+    output_data[i] = result;
+  }
+}
+
 inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_zero_point, int32 input_range_radius,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
-            // Convert from Q0.31 to Q23.8.
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
-            if (output_val_s32 == 256) {
-              output_val_s32 = 255;
-            }
-            // Reinterpret as U0.8.
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8 input_val_u8 = input_data[i];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
+      // Convert from Q0.31 to Q23.8.
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      // Reinterpret as U0.8.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
     }
+    output_data[i] = output_val;
   }
 }
 
 inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
                      int16* output_data, const Dims<4>& output_dims) {
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
@@ -2692,20 +2679,12 @@ inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
 
 inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = std::tanh(val);
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = std::tanh(val);
+    output_data[i] = result;
   }
 }
 
@@ -2714,47 +2693,38 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_multiplier, int input_left_shift,
                  uint8* output_data, const Dims<4>& output_dims) {
   const int32 output_zero_point = 128;
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
-            // Convert from Q0.31 to Q24.7.
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
-            output_val_s32 += output_zero_point;
-            if (output_val_s32 == 256) {
-              output_val_s32 = 255;
-            }
-            // Reinterpret as Q0.7, encoded in uint8.
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8 input_val_u8 = input_data[i];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+      // Convert from Q0.31 to Q24.7.
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      output_val_s32 += output_zero_point;
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      // Reinterpret as Q0.7, encoded in uint8.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
     }
+    output_data[i] = output_val;
   }
 }
 
@@ -2766,8 +2736,7 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
   TFLITE_DCHECK_GE(input_left_shift, 0);
   TFLITE_DCHECK_LE(input_left_shift, 1);
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
 
   // F0 uses 0 integer bits, range [-1, 1].
   // This is the return type of math functions such as tanh, logistic,
@@ -2795,138 +2764,62 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int32 val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = static_cast<float>(scale * (val - zero_point));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int32 val = input_data[i];
+    float result = static_cast<float>(scale * (val - zero_point));
+    output_data[i] = result;
   }
 }
 
 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, float* output_data,
+                      float rmin, float rmax, int num_bits, float* output_data,
                       const Dims<4>& output_dims) {
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
-  TFLITE_DCHECK_LE(rmin, 0.);
-  TFLITE_DCHECK_GE(rmax, 0.);
-
-  // Determine quantization parameters: zero_point, scale.
-  using Integer = uint8;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const float qmin_float = qmin;
-  const float qmax_float = qmax;
-  int32 zero_point = 0;
-  float scale = 0.f;
-  // If rmin==rmax, both must be zero per the above assertion,
-  // so we are done.
-  if (rmin != rmax) {
-    // First determine the scale.
-    scale = (rmax - rmin) / (qmax_float - qmin_float);
-
-    // Zero-point computation.
-    // First the initial floating-point computation. The zero-point can be
-    // determined from solving an affine equation for any known pair
-    // (real value, corresponding quantized value).
-    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-    // The arithmetic error on the zero point computed from either pair
-    // will be roughly machine_epsilon * (sum of absolute values of terms)
-    // so we want to use the variant that adds the smaller terms.
-    const float zero_point_from_min = qmin_float - rmin / scale;
-    const float zero_point_from_max = qmax_float - rmax / scale;
-    const float zero_point_from_min_error =
-        std::abs(qmin_float) + std::abs(rmin / scale);
-    const float zero_point_from_max_error =
-        std::abs(qmax_float) + std::abs(rmax / scale);
-
-    const float zero_point_float =
-        zero_point_from_min_error < zero_point_from_max_error
-            ? zero_point_from_min
-            : zero_point_from_max;
-
-    // Now we need to nudge the zero point to be an integer
-    // (our zero points are integer, and this is motivated by the requirement
-    // to be able to represent the real value "0" exactly as a quantized value,
-    // which is required in multiple places, for example in Im2col with SAME
-    // padding).
-    if (zero_point_float < qmin_float) {
-      zero_point = qmin;
-    } else if (zero_point_float > qmax_float) {
-      zero_point = qmax;
-    } else {
-      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
-    }
-    // The zero point should always be in the range of quantized value,
-    // [qmin, qmax].
-    TFLITE_DCHECK_GE(zero_point, qmin);
-    TFLITE_DCHECK_LE(zero_point, qmax);
-  }
-
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float unclamped_quantized_val =
-              TfLiteRound(zero_point + src_val / scale);
-          const float quantized_val = std::min(
-              qmax_float, std::max(qmin_float, unclamped_quantized_val));
-          const float dst_val = scale * (quantized_val - zero_point);
-          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
-        }
-      }
-    }
+  TFLITE_DCHECK_LE(rmin, 0.0f);
+  TFLITE_DCHECK_GE(rmax, 0.0f);
+  TFLITE_DCHECK_LT(rmin, rmax);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  int quant_min = 0;
+  int quant_max = (1 << num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const float inv_nudged_scale = 1.0f / nudged_scale;
+
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+  for (int i = 0; i < flat_size; i++) {
+    const float src_val = input_data[i];
+    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
+    const float clamped_shifted = clamped - nudged_min;
+    const float dst_val =
+        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+        nudged_min;
+    output_data[i] = dst_val;
   }
 }
 
 template <typename SrcT, typename DstT>
 inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                  DstT* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int offset = Offset(input_dims, c, x, y, b);
-          output_data[offset] = static_cast<DstT>(input_data[offset]);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = static_cast<DstT>(input_data[offset]);
   }
 }
 
 inline void Floor(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int offset = Offset(input_dims, c, x, y, b);
-          output_data[offset] = std::floor(input_data[offset]);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = std::floor(input_data[offset]);
   }
 }
 
@@ -3056,24 +2949,37 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
 template <typename T>
 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
-                           const Dims<4>& block_shape_dims, T* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& block_shape_dims,
+                           const int32* crops_data, const Dims<4>& crops_dims,
+                           T* output_data, const Dims<4>& output_dims) {
   const int output_batch_size = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
   const int input_batch_size = ArraySize(input_dims, 3);
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int depth = ArraySize(input_dims, 0);
   const int block_shape_width = block_shape_data[1];
   const int block_shape_height = block_shape_data[0];
+  const int crops_top = crops_data[0];
+  const int crops_left = crops_data[2];
 
   for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
+    const int out_batch = in_batch % output_batch_size;
+    const int spatial_offset = in_batch / output_batch_size;
     for (int in_h = 0; in_h < input_height; ++in_h) {
+      const int out_h = in_h * block_shape_height +
+                        spatial_offset / block_shape_width - crops_top;
+      if (out_h < 0 || out_h >= output_height) {
+        continue;
+      }
       for (int in_w = 0; in_w < input_width; ++in_w) {
-        int out_batch = in_batch % output_batch_size;
-        int out_w = in_w * block_shape_width +
-                    (in_batch / output_batch_size) % block_shape_width;
-        int out_h = in_h * block_shape_height +
-                    (in_batch / output_batch_size) / block_shape_width;
+        const int out_w = in_w * block_shape_width +
+                          spatial_offset % block_shape_width - crops_left;
+
+        if (out_w < 0 || out_w >= output_width) {
+          continue;
+        }
         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
         const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
         memcpy(out, in, depth * sizeof(T));
@@ -3086,7 +2992,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
@@ -3116,7 +3022,7 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
               out_w >= output_width - right_w_padding ||
               out_d < left_d_padding ||
               out_d >= output_depth - right_d_padding) {
-            *out_ptr++ = 0;
+            *out_ptr++ = static_cast<T>(pad_value);
           } else {
             *out_ptr++ = *in_ptr++;
           }
@@ -3126,6 +3032,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
 inline bool LoopCondition(int index, int stop, int stride) {
   return stride > 0 ? index < stop : index > stop;
 }
@@ -3375,23 +3290,11 @@ template <typename T>
 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
-  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
-  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
-  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
 
   auto min_value = input2_data[0];
-
-  for (int b = 0; b < batches; b++) {
-    for (int y = 0; y < input_height; y++) {
-      for (int x = 0; x < input_width; x++) {
-        for (int c = 0; c < depth; c++) {
-          int offset = Offset(input1_dims, c, x, y, b);
-          output_data[offset] =
-              input1_data[offset] > min_value ? min_value : input1_data[offset];
-        }
-      }
-    }
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
   }
 }
 
@@ -3399,30 +3302,19 @@ template <typename T>
 void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
-  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
-  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
-  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
 
   auto max_value = input2_data[0];
-
-  for (int b = 0; b < batches; b++) {
-    for (int y = 0; y < input_height; y++) {
-      for (int x = 0; x < input_width; x++) {
-        for (int c = 0; c < depth; c++) {
-          int offset = Offset(input1_dims, c, x, y, b);
-          output_data[offset] =
-              input1_data[offset] < max_value ? max_value : input1_data[offset];
-        }
-      }
-    }
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
   }
 }
 
-template <typename T>
-void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, const Dims<4>& input2_dims,
-                       T* output_data, const Dims<4>& output_dims) {
+template <typename T, typename Op>
+void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                              const T* input2_data, const Dims<4>& input2_dims,
+                              T* output_data, const Dims<4>& output_dims,
+                              Op op) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
@@ -3436,7 +3328,7 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
           auto in2_idx = SubscriptToIndex(desc2, c, x, y, b);
           auto in1_val = input1_data[in1_idx];
           auto in2_val = input2_data[in2_idx];
-          output_data[out_idx] = in1_val > in2_val ? in1_val : in2_val;
+          output_data[out_idx] = op(in1_val, in2_val);
         }
       }
     }
@@ -3455,25 +3347,20 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
   // input dimensions here. We enforce the constraint that the last dimension
   // must always be 1.
   TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = ArraySize(input_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
-        int max_index = 0;
-        for (int d = 1; d < depth; ++d) {
-          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
-          if (curr_value > max_value) {
-            max_value = curr_value;
-            max_index = d;
-          }
-        }
-        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+
+  for (int i = 0; i < outer_size; ++i) {
+    auto max_value = input_data[i * depth];
+    int max_index = 0;
+    for (int d = 1; d < depth; ++d) {
+      const auto& curr_value = input_data[i * depth + d];
+      if (curr_value > max_value) {
+        max_value = curr_value;
+        max_index = d;
       }
     }
+    output_data[i] = max_index;
   }
 }
 
@@ -3523,11 +3410,11 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
 
   // Although transpose convolution simplifies to convolution with transposed
   // weights for strides of 1, non-unitary striding complicates matters. To
-  // keep this reference implementation as clear as possible, we use a "scatter"
-  // access pattern, where we loop through all the input elements, computing
-  // their influence on the output, rather than looping through the output
-  // elements in the typical "gather" access pattern of a conv. We therefore
-  // must initialize the output array to zero.
+  // keep this reference implementation as clear as possible, we use a
+  // "scatter" access pattern, where we loop through all the input elements,
+  // computing their influence on the output, rather than looping through the
+  // output elements in the typical "gather" access pattern of a conv. We
+  // therefore must initialize the output array to zero.
   for (int i = 0; i < RequiredBufferSizeForDims(output_dims); i++) {
     output_data[i] = 0.0f;
   }
@@ -3567,6 +3454,51 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Less(int64_t num_elements, const T* input1, const T* input2,
+                 bool* output) {
+  for (int64_t i = 0; i < num_elements; ++i) {
+    output[i] = input1[i] < input2[i];
+  }
+}
+
+template <typename T>
+inline void Less(const T* input1_data, const Dims<4>& input1_dims,
+                 const T* input2_data, const Dims<4>& input2_dims,
+                 bool* output_data, const Dims<4>& output_dims) {
+  const int64_t batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int64_t height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int64_t width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int64_t depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  Less(batches * height * width * depth, input1_data, input2_data, output_data);
+}
+
+template <typename T1, typename T2>
+inline void BroadcastLess(T1* input1_data, const Dims<4>& input1_dims,
+                          T2* input2_data, const Dims<4>& input2_dims,
+                          bool* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastLess");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] <
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 4bce2ffaaf326cf083a76c76adb093f3ac2e8850..62cea143e6afc0631493012be26808a89eb03138 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -44,6 +44,11 @@ inline int64_t* GetTensorData(TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.i64 : nullptr;
 }
 
+template <>
+inline bool* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.b : nullptr;
+}
+
 inline int RemapDim(int max_dimensions, int d) {
   return max_dimensions - d - 1;
 }
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 293538fcbb6406d6065d8efd25adb3b163638c92..3290c364c18224edb733c177ad72bf86b6892434 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -130,14 +130,125 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
   return MatchingArraySize(array1, index1, args...);
 }
 
-inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+template <int N>
+inline int FlatSize(const Dims<N>& dims) {
   int max_offset = 0;
-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < N; i++) {
     max_offset += (dims.sizes[i] - 1) * dims.strides[i];
   }
   return max_offset + 1;
 }
 
+// Deprecated. Prefer FlatSize.
+inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+  return FlatSize(dims);
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return MatchingFlatSize(dims, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2,
+                            const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
+}
+
+// Data is required to be contiguous, and so many operators can use either the
+// full array flat size or the flat size with one dimension skipped (commonly
+// the depth).
+template <int N>
+inline int FlatSizeSkipDim(const Dims<N>& dims, int skip_dim) {
+  TFLITE_DCHECK(skip_dim >= 0 && skip_dim < N);
+  int flat_size = 1;
+  for (int i = 0; i < N; i++) {
+    flat_size *= (i == skip_dim) ? 1 : dims.sizes[i];
+  }
+  return flat_size;
+}
+
+// A combination of MatchingFlatSize() and FlatSizeSkipDim().
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return FlatSizeSkipDim(dims, skip_dim);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2,
+                                   const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2,
+                                 check_dims_3);
+}
+
 template <int N>
 bool IsPackedWithoutStrides(const Dims<N>& dims) {
   int expected_stride = 1;
diff --git a/tensorflow/contrib/lite/kernels/maximum.cc b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
similarity index 59%
rename from tensorflow/contrib/lite/kernels/maximum.cc
rename to tensorflow/contrib/lite/kernels/maximum_minimum.cc
index 13c40603ced6338086809b908539156e2c0985e7..5a28d663c9e756040746f0a98b356afba76cceab 100644
--- a/tensorflow/contrib/lite/kernels/maximum.cc
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
@@ -24,9 +24,9 @@ limitations under the License.
 namespace tflite {
 namespace ops {
 namespace builtin {
-namespace maximum {
+namespace maximum_minimum {
 
-// This file has a reference implemenation of TFMaximum.
+// This file has a reference implemenation of TFMaximum/TFMinimum.
 enum KernelType {
   kReference,
 };
@@ -35,8 +35,8 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
-struct MaximumContext {
-  MaximumContext(TfLiteContext* context, TfLiteNode* node) {
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
     input1 = GetInput(context, node, kInputTensor1);
     input2 = GetInput(context, node, kInputTensor2);
     output = GetOutput(context, node, kOutputTensor);
@@ -50,7 +50,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  MaximumContext op_context(context, node);
+  OpContext op_context(context, node);
   TF_LITE_ENSURE_EQ(context, op_context.input1->type, op_context.input2->type);
   op_context.output->type = op_context.input1->type;
 
@@ -69,23 +69,49 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return context->ResizeTensor(context, op_context.output, output_size);
 }
 
-template <KernelType kernel_type>
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  MaximumContext op_context(context, node);
+struct MaximumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 > el2 ? el1 : el2;
+  }
+};
+
+struct MinimumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 < el2 ? el1 : el2;
+  }
+};
+
+template <typename data_type, typename op_type>
+void TFLiteOperation(TfLiteContext* context, TfLiteNode* node,
+                      const OpContext& op_context) {
+  reference_ops::TensorFlowMaximumMinimum<data_type>(
+      GetTensorData<data_type>(op_context.input1),
+      GetTensorDims(op_context.input1),
+      GetTensorData<data_type>(op_context.input2),
+      GetTensorDims(op_context.input2),
+      GetTensorData<data_type>(op_context.output),
+      GetTensorDims(op_context.output), op_type::template op<data_type>);
+}
 
-#define TF_LITE_MAXIMUM(kernel_type, data_type)    \
-  kernel_type::TensorFlowMaximum<data_type>(       \
-      GetTensorData<data_type>(op_context.input1), \
-      GetTensorDims(op_context.input1),            \
-      GetTensorData<data_type>(op_context.input2), \
-      GetTensorDims(op_context.input2),            \
-      GetTensorData<data_type>(op_context.output), \
-      GetTensorDims(op_context.output))
+template <KernelType kernel_type, typename OpType>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
 
   if (kernel_type == kReference) {
     switch (op_context.output->type) {
       case kTfLiteFloat32:
-        TF_LITE_MAXIMUM(reference_ops, float);
+        TFLiteOperation<float, OpType>(context, node, op_context);
+        break;
+      case kTfLiteUInt8:
+        TFLiteOperation<uint8_t, OpType>(context, node, op_context);
+        break;
+      case kTfLiteInt32:
+       TFLiteOperation<int32_t, OpType>(context, node, op_context);
+        break;
+      case kTfLiteInt64:
+        TFLiteOperation<int64_t, OpType>(context, node, op_context);
         break;
       default:
         context->ReportError(context,
@@ -99,19 +125,28 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                          op_context.output->type);
     return kTfLiteError;
   }
-#undef TF_LITE_MAXIMUM
   return kTfLiteOk;
 }
 
-}  // namespace maximum
+}  // namespace maximum_minimum
 
 TfLiteRegistration* Register_MAXIMUM_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, maximum::Prepare,
-                                 maximum::Eval<maximum::kReference>};
+  static TfLiteRegistration r = {
+      nullptr, nullptr, maximum_minimum::Prepare,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MaximumOp>};
   return &r;
 }
 
+TfLiteRegistration* Register_MINIMUM_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, maximum_minimum::Prepare,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MinimumOp>};
+  return &r;
+}
 TfLiteRegistration* Register_MAXIMUM() { return Register_MAXIMUM_REF(); }
+TfLiteRegistration* Register_MINIMUM() { return Register_MINIMUM_REF(); }
 
 }  // namespace builtin
 }  // namespace ops
diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0752aa1804722accb1f88910fe013ffd632a4503
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class MaxMinOpModel : public SingleOpModel {
+ public:
+  MaxMinOpModel(tflite::BuiltinOperator op, const TensorData& input1,
+                const TensorData& input2, const TensorType& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(op, BuiltinOptions_MaximumMinimumOptions,
+                 CreateMaximumMinimumOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  template <class T>
+  void SetInput1(std::initializer_list<T> data) {
+    PopulateTensor(input1_, data);
+  }
+
+  template <class T>
+  void SetInput2(std::initializer_list<T> data) {
+    PopulateTensor(input2_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+template <typename data_type>
+void TestModel(tflite::BuiltinOperator op, const TensorData& input1,
+               const TensorData& input2, const TensorData& output,
+               std::initializer_list<data_type> input1_values,
+               std::initializer_list<data_type> input2_values,
+               std::initializer_list<data_type> output_values) {
+  MaxMinOpModel m(op, input1, input2, output.type);
+  m.SetInput1<data_type>(input1_values);
+  m.SetInput2<data_type>(input2_values);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray(output.shape));
+  EXPECT_THAT(m.GetOutput<data_type>(), ElementsAreArray(output_values));
+}
+
+template <>
+void TestModel(tflite::BuiltinOperator op, const TensorData& input1,
+               const TensorData& input2, const TensorData& output,
+               std::initializer_list<float> input1_values,
+               std::initializer_list<float> input2_values,
+               std::initializer_list<float> output_values) {
+  MaxMinOpModel m(op, input1, input2, output.type);
+  m.SetInput1<float>(input1_values);
+  m.SetInput2<float>(input2_values);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray(output.shape));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear(output_values)));
+}
+
+TEST(MaximumOpTest, FloatTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  TestModel<float>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, data1, data2,
+                   {1.0, 0.0, 1.0, 12.0, -2.0, -1.43});
+  TestModel<float>(BuiltinOperator_MINIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, data1, data2,
+                   {-1.0, 0.0, -1.0, 11.0, -3.0, -1.44});
+}
+
+TEST(MaxMinOpTest, Uint8Test) {
+  std::initializer_list<uint8_t> data1 = {1, 0, 2, 11, 2, 23};
+  std::initializer_list<uint8_t> data2 = {0, 0, 1, 12, 255, 1};
+  TestModel<uint8_t>(BuiltinOperator_MAXIMUM, {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}}, data1, data2,
+                     {1, 0, 2, 12, 255, 23});
+  TestModel<uint8_t>(BuiltinOperator_MINIMUM, {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}}, data1, data2,
+                     {0, 0, 1, 11, 2, 1});
+}
+
+TEST(MaximumOpTest, FloatWithBroadcastTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
+  std::initializer_list<float> data2 = {0.5, 2.0};
+  TestModel<float>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {2}}, {TensorType_FLOAT32, {3, 1, 2}},
+                   data1, data2, {1.0, 2.0, 0.5, 2.0, 0.5, 11.0});
+  TestModel<float>(BuiltinOperator_MINIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {2}}, {TensorType_FLOAT32, {3, 1, 2}},
+                   data1, data2, {0.5, 0.0, -1.0, -2.0, -1.44, 2.0});
+}
+
+TEST(MaximumOpTest, Int32WithBroadcastTest) {
+  std::initializer_list<int32_t> data1 = {1, 0, -1, -2, 3, 11};
+  std::initializer_list<int32_t> data2 = {2};
+  TestModel<int32>(BuiltinOperator_MAXIMUM, {TensorType_INT32, {3, 1, 2}},
+                   {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}},
+                   data1, data2, {2, 2, 2, 2, 3, 11});
+  TestModel<int32>(BuiltinOperator_MINIMUM, {TensorType_INT32, {3, 1, 2}},
+                   {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}},
+                   data1, data2, {1, 0, -1, -2, 2, 2});
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/maximum_test.cc b/tensorflow/contrib/lite/kernels/maximum_test.cc
deleted file mode 100644
index df2bf29c205e0a3ff6ea5df2bba8ca721a09e626..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/maximum_test.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class MaximumOpModel : public SingleOpModel {
- public:
-  MaximumOpModel(const TensorData& input1, const TensorData& input2,
-                 const TensorType& output) {
-    input1_ = AddInput(input1);
-    input2_ = AddInput(input2);
-    output_ = AddOutput(output);
-    SetBuiltinOp(BuiltinOperator_MAXIMUM, BuiltinOptions_MaximumOptions,
-                 CreateMaximumOptions(builder_).Union());
-    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
-  }
-
-  template <class T>
-  void SetInput1(std::initializer_list<T> data) {
-    PopulateTensor(input1_, data);
-  }
-
-  template <class T>
-  void SetInput2(std::initializer_list<T> data) {
-    PopulateTensor(input2_, data);
-  }
-
-  template <class T>
-  std::vector<T> GetOutput() {
-    return ExtractVector<T>(output_);
-  }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- protected:
-  int input1_;
-  int input2_;
-  int output_;
-};
-
-TEST(MaximumOpTest, FloatTest) {
-  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
-  std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
-  MaximumOpModel m({TensorType_FLOAT32, {3, 1, 2}},
-                   {TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32);
-  m.SetInput1<float>(data1);
-  m.SetInput2<float>(data2);
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
-  EXPECT_THAT(
-      m.GetOutput<float>(),
-      ElementsAreArray(ArrayFloatNear({1.0, 0.0, 1.0, 12.0, -2.0, -1.43})));
-}
-
-TEST(MaximumOpTest, FloatWithBroadcastTest) {
-  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
-  std::initializer_list<float> data2 = {0.5, 2.0};
-  MaximumOpModel m({TensorType_FLOAT32, {3, 1, 2}}, {TensorType_FLOAT32, {2}},
-                   TensorType_FLOAT32);
-  m.SetInput1<float>(data1);
-  m.SetInput2<float>(data2);
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
-  EXPECT_THAT(
-      m.GetOutput<float>(),
-      ElementsAreArray(ArrayFloatNear({1.0, 2.0, 0.5, 2.0, 0.5, 11.0})));
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index c29da3862e84d6756bf5ef34b2ca06307b0a065d..4f9449a225c66a0fb2a9285e6aff3a1f7147f5dd 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -119,39 +119,46 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     after_padding.push_back(paddings_data[idx * 2 + 1]);
   }
 
-#define TF_LITE_PAD(type, scalar)                                           \
+#define TF_LITE_PAD(type, scalar, pad_value)                                \
   type::Pad(GetTensorData<scalar>(op_context.input),                        \
             GetTensorDims(op_context.input), before_padding, after_padding, \
             GetTensorData<scalar>(op_context.output),                       \
-            GetTensorDims(op_context.output))
+            GetTensorDims(op_context.output), pad_value)
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, float);
+        TF_LITE_PAD(reference_ops, float, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, float);
+        TF_LITE_PAD(optimized_ops, float, 0);
       }
       break;
     case kTfLiteUInt8:
+      // Quantized Pad requires that 0 is represented in the quantized range.
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                  std::numeric_limits<uint8_t>::min());
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                  std::numeric_limits<uint8_t>::max());
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, uint8_t);
+        TF_LITE_PAD(reference_ops, uint8_t,
+                    op_context.output->params.zero_point);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, uint8_t);
+        TF_LITE_PAD(optimized_ops, uint8_t,
+                    op_context.output->params.zero_point);
       }
       break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int32_t);
+        TF_LITE_PAD(reference_ops, int32_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int32_t);
+        TF_LITE_PAD(optimized_ops, int32_t, 0);
       }
       break;
     case kTfLiteInt64:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int64_t);
+        TF_LITE_PAD(reference_ops, int64_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int64_t);
+        TF_LITE_PAD(optimized_ops, int64_t, 0);
       }
       break;
     default:
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
index 28834ad0719291b2e868bca2d86a6685e6eb9962..c06237e5720874e66c5953edab2d3749cc88af28 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -22,6 +22,7 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
+using ::testing::Matcher;
 
 class PadOpModel : public SingleOpModel {
  public:
@@ -29,6 +30,10 @@ class PadOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetQuantizedInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
@@ -36,6 +41,11 @@ class PadOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
  protected:
   int input_;
   int output_;
@@ -50,16 +60,17 @@ class PadOpModel : public SingleOpModel {
 //    m.Invoke();
 class PadOpConstModel : public PadOpModel {
  public:
-  PadOpConstModel(std::initializer_list<int> input_shape,
+  PadOpConstModel(const TensorData& input,
                   std::initializer_list<int> paddings_shape,
-                  std::initializer_list<int> paddings) {
-    input_ = AddInput(TensorType_FLOAT32);
+                  std::initializer_list<int> paddings,
+                  const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape});
+    BuildInterpreter({input.shape});
   }
 };
 
@@ -72,40 +83,45 @@ class PadOpConstModel : public PadOpModel {
 //    m.Invoke();
 class PadOpDynamicModel : public PadOpModel {
  public:
-  PadOpDynamicModel(std::initializer_list<int> input_shape,
-                    std::initializer_list<int> paddings_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  PadOpDynamicModel(const TensorData& input,
+                    std::initializer_list<int> paddings_shape,
+                    const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape, paddings_shape});
+    BuildInterpreter({input.shape, paddings_shape});
   }
 };
 
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {9, 2},
-                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9},
+                      {TensorType_FLOAT32}),
       "dims != 4");
 }
 
 TEST(PadOpTest, UnequalDimensions) {
-  EXPECT_DEATH(PadOpConstModel({1, 1, 2, 1}, {3, 2}, {1, 1, 2, 2, 3, 3}),
+  EXPECT_DEATH(PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
+                               {1, 1, 2, 2, 3, 3}, {TensorType_FLOAT32}),
                "3 != 4");
 }
 
 TEST(PadOpTest, InvalidPadValue) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 1, 2, 1}, {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                      {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
       "Pad value has to be greater than equal to 0.");
 }
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadOpConstModel m({1, 2, 2, 1}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -114,7 +130,8 @@ TEST(PadOpTest, SimpleConstTest) {
 }
 
 TEST(PadOpTest, SimpleDynamicTest) {
-  PadOpDynamicModel m({1, 2, 2, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -124,7 +141,8 @@ TEST(PadOpTest, SimpleDynamicTest) {
 }
 
 TEST(PadOpTest, AdvancedConstTest) {
-  PadOpConstModel m({1, 2, 3, 1}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
@@ -134,7 +152,8 @@ TEST(PadOpTest, AdvancedConstTest) {
 }
 
 TEST(PadOpTest, AdvancedDynamicTest) {
-  PadOpDynamicModel m({1, 2, 3, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
@@ -144,6 +163,80 @@ TEST(PadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+class QuantizedPadOpTest : public ::testing::Test {
+ protected:
+  std::vector<Matcher<float>> DequantizedArrayNear(
+      const std::vector<float>& values, const float min, const float max) {
+    const float quantization_tolerance = (max - min) / 255.0;
+    return ArrayFloatNear(values, quantization_tolerance);
+  }
+};
+
+TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
+  // The test_util and actual quantization code currently ensure that the range
+  // must include zero, but if that ever changes, this test will catch it.
+  EXPECT_DEATH(PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
+                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+                                 {TensorType_UINT8, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedPadOpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, SimpleDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedConstTest) {
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/contrib/lite/kernels/padding.h
index 40b8476b3779c66e31a04856bce8aebd378f1e5f..e81b970e0fb149e8c5d95ed12622917fdc336f7a 100644
--- a/tensorflow/contrib/lite/kernels/padding.h
+++ b/tensorflow/contrib/lite/kernels/padding.h
@@ -17,9 +17,10 @@ limitations under the License.
 
 namespace tflite {
 
-inline int ComputePadding(int stride, int in_size, int filter_size,
-                          int out_size) {
-  int padding = ((out_size - 1) * stride + filter_size - in_size) / 2;
+inline int ComputePadding(int stride, int dilation_rate, int in_size,
+                          int filter_size, int out_size) {
+  int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  int padding = ((out_size - 1) * stride + effective_filter_size - in_size) / 2;
   return padding > 0 ? padding : 0;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc
index b79880110897a1438a589d97363fd861c61667e7..0bf27c34c1337b4ae4b8b73ee2dafcc931c7ce3c 100644
--- a/tensorflow/contrib/lite/kernels/pooling.cc
+++ b/tensorflow/contrib/lite/kernels/pooling.cc
@@ -94,9 +94,9 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   int outHeight =
       computeOutSize(height, params->filter_height, params->stride_height);
 
-  data->padding.height = ComputePadding(params->stride_height, height,
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
                                         params->filter_height, outHeight);
-  data->padding.width = ComputePadding(params->stride_width, width,
+  data->padding.width = ComputePadding(params->stride_width, 1, width,
                                        params->filter_width, outWidth);
 
   if (input->type == kTfLiteUInt8) {
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 0f98154b904b1f776016e6bbee3263027f815244..b07e7b6ff32e9ea513e60619078e09b7e6d1db72 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -77,6 +77,9 @@ TfLiteRegistration* Register_CAST();
 TfLiteRegistration* Register_DEQUANTIZE();
 TfLiteRegistration* Register_PRELU();
 TfLiteRegistration* Register_MAXIMUM();
+TfLiteRegistration* Register_MINIMUM();
+TfLiteRegistration* Register_ARG_MAX();
+TfLiteRegistration* Register_LESS();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -135,6 +138,9 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
+  AddBuiltin(BuiltinOperator_LESS, Register_LESS());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index 66b06aeaec52dd3d2d98acfec8218ffdd0ae6bf3..5acb3561817f2989d2db7fd0b0bf2dac5a100389 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -174,7 +174,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                                output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Inputs and outputs not all float|unit8 types.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 3448de68e8e678e3b5316e82617a5b15a5e7b8b4..2dd6d67e078619df41524e8242a0475320c02013 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -57,6 +57,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
     case TensorType_STRING:
       *type = kTfLiteString;
       break;
+    case TensorType_BOOL:
+      *type = kTfLiteBool;
+      break;
     default:
       error_reporter->Report("Unimplemented data type %s (%d) in tensor\n",
                              EnumNameTensorType(tensor_type), tensor_type);
@@ -261,13 +264,11 @@ T* MallocPOD() {
 // Parse the appropriate data out of the op.
 //
 // This handles builtin data explicitly as there are flatbuffer schemas.
-//
-// Returns memory that must be feed.
-//
-// TODO(nupurgarg): Pass in void ** and return TfLiteStatus to ensure program
-// crashes if error reporter is called.
-void* ParseOpData(const Operator* op, BuiltinOperator op_type,
-                  ErrorReporter* error_reporter) {
+// If it returns kTfLiteOk, it passes the data out with `builtin_data`, which
+// need to be released by calling `free`.`
+// If it returns kTfLiteError, `builtin_data` will be `nullptr`.
+TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
+                         ErrorReporter* error_reporter, void** builtin_data) {
   auto parse_padding = [](Padding padding) {
     switch (padding) {
       case Padding_SAME:
@@ -316,7 +317,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     }
   };
 
-  void* builtin_data = nullptr;
+  *builtin_data = nullptr;
   switch (op_type) {
     case BuiltinOperator_CALL:
       // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
@@ -332,8 +333,10 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->stride_height = conv_params->stride_h();
         params->activation =
             parse_activation(conv_params->fused_activation_function());
+        params->dilation_width_factor = conv_params->dilation_w_factor();
+        params->dilation_height_factor = conv_params->dilation_h_factor();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_TANH:
@@ -358,10 +361,11 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             ConvertTensorType(schema_params->out_data_type(),
                               &params->out_data_type, error_reporter);
         if (in_status != kTfLiteOk || out_status != kTfLiteOk) {
-          break;
+          free(params);
+          return kTfLiteError;
         }
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_LSH_PROJECTION: {
@@ -370,7 +374,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* lshParams = op->builtin_options_as_LSHProjectionOptions()) {
         params->type = parseLSHProjectionType(lshParams->type());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_AVERAGE_POOL_2D:
@@ -386,7 +390,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(pool_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DEPTHWISE_CONV_2D: {
@@ -400,7 +404,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(conv_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SVDF: {
@@ -410,7 +414,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(svdf_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
@@ -422,7 +426,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(sequence_rnn_params->fused_activation_function());
         params->time_major = sequence_rnn_params->time_major();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RNN: {
@@ -431,7 +435,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(rnn_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_EMBEDDING_LOOKUP:
@@ -444,7 +448,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
               op->builtin_options_as_EmbeddingLookupSparseOptions()) {
         params->combiner = parseCombinerType(embedding_params->combiner());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_FULLY_CONNECTED: {
@@ -455,7 +459,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation = parse_activation(
             fully_connected_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_HASHTABLE_LOOKUP:
@@ -466,7 +470,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* softmax_params = op->builtin_options_as_SoftmaxOptions()) {
         params->beta = softmax_params->beta();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_CONCATENATION: {
@@ -478,7 +482,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(concatenation_params->fused_activation_function());
         params->axis = concatenation_params->axis();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_MUL: {
@@ -487,7 +491,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_ADD: {
@@ -496,7 +500,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DIV: {
@@ -505,7 +509,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SUB: {
@@ -514,7 +518,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_L2_NORMALIZATION: {
@@ -523,7 +527,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: {
@@ -535,7 +539,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->alpha = schema_params->alpha();
         params->beta = schema_params->beta();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
@@ -548,7 +552,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->cell_clip = lstm_params->cell_clip();
         params->proj_clip = lstm_params->proj_clip();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RESIZE_BILINEAR: {
@@ -557,7 +561,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
               op->builtin_options_as_ResizeBilinearOptions()) {
         params->align_corners = schema_params->align_corners();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_PAD: {
@@ -571,7 +575,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                                    params->shape, error_reporter);
         params->num_dimensions = new_shape->Length();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SKIP_GRAM: {
@@ -581,7 +585,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->max_skip_size = skip_gram_params->max_skip_size();
         params->include_all_ngrams = skip_gram_params->include_all_ngrams();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPACE_TO_DEPTH: {
@@ -589,7 +593,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_SpaceToDepthOptions()) {
         params->block_size = schema_params->block_size();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_GATHER: {
@@ -599,7 +603,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->axis = gather_params->axis();
       }
 
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPACE_TO_BATCH_ND: {
@@ -616,7 +620,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
         params->keep_dims = schema_params->keep_dims();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPLIT: {
@@ -624,7 +628,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_SplitOptions()) {
         params->num_splits = schema_params->num_splits();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SQUEEZE: {
@@ -635,7 +639,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                                    params->squeeze_dims, error_reporter);
         params->num_squeeze_dims = squeeze_dims->Length();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_STRIDED_SLICE: {
@@ -647,19 +651,32 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->new_axis_mask = schema_params->new_axis_mask();
         params->shrink_axis_mask = schema_params->shrink_axis_mask();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_MAXIMUM:
+    case BuiltinOperator_MINIMUM: {
+      break;
+    }
+    case BuiltinOperator_ARG_MAX: {
+      auto* params = MallocPOD<TfLiteArgMaxParams>();
+      if (auto* schema_params = op->builtin_options_as_ArgMaxOptions()) {
+        ConvertTensorType(schema_params->output_type(), &params->output_type,
+                          error_reporter);
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_MAXIMUM: {
+    case BuiltinOperator_LESS: {
       break;
     }
     case BuiltinOperator_DELEGATE: {
       // TODO(ycling): Revisit when supporting saving delegated models.
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
-      break;
+      return kTfLiteError;
     }
   }
-  return builtin_data;
+  return kTfLiteOk;
 }
 
 }  // namespace
@@ -699,10 +716,13 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
           reinterpret_cast<const char*>(op->custom_options()->data()),
           op->custom_options()->size(), nullptr, reg);
     } else {
+      void* builtin_data = nullptr;
+      TF_LITE_ENSURE_STATUS(
+          ParseOpData(op, op_type, error_reporter_, &builtin_data));
       interpreter->AddNodeWithParameters(
           FlatBufferIntArrayToVector(op->inputs()),
-          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0,
-          ParseOpData(op, op_type, error_reporter_), reg);
+          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0, builtin_data,
+          reg);
     }
   }
 
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index bc13444dc70f27e3360774e843985b6294be6996..eab82ea8ef23542cb3fc490d913313d4c757e466 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -351,6 +351,9 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_CAST:
       case tflite::BuiltinOperator_PRELU:
       case tflite::BuiltinOperator_MAXIMUM:
+      case tflite::BuiltinOperator_MINIMUM:
+      case tflite::BuiltinOperator_ARG_MAX:
+      case tflite::BuiltinOperator_LESS:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc
index 1f762e6688d0cc2a91417b9d82201446e3060a6f..e1366639c78a4e90740aaf42a9ba5770ec65cb78 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.cc
+++ b/tensorflow/contrib/lite/optional_debug_tools.cc
@@ -48,6 +48,8 @@ const char* TensorTypeName(TfLiteType type) {
       return "kTfLiteInt64";
     case kTfLiteString:
       return "kTfLiteString";
+    case kTfLiteBool:
+      return "kTfLiteBool";
   }
   return "(invalid)";
 }
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index e735062a7f2749c1e1c43e9c5f4971b3c7383387..926896d609d83aac3b875d33dfe3c4dc7ae89ccd 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -97,6 +97,19 @@ py_binary(
     ],
 )
 
+py_binary(
+    name = "create_custom_op",
+    srcs = ["create_custom_op.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "@absl_py//absl/flags",
+    ],
+)
+
 py_test(
     name = "convert_saved_model_test",
     srcs = ["convert_saved_model_test.py"],
@@ -106,8 +119,13 @@ py_test(
     deps = [
         ":convert_saved_model",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:nn",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model",
     ],
 )
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index d87fbeb91cc3d2779c0ae01aff488f88bd340c1c..734e42d619bdb79de0306a94e304ce46065d14d4 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -25,21 +25,21 @@ from __future__ import print_function
 
 import os
 from tensorflow.contrib.lite.python import convert_saved_model
-from tensorflow.python import estimator
 from tensorflow.python import keras
-from tensorflow.python import layers
-from tensorflow.python import losses
-from tensorflow.python import nn
-from tensorflow.python import saved_model
-from tensorflow.python import train
 from tensorflow.python.client import session
+from tensorflow.python.estimator import estimator_lib as estimator
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
+from tensorflow.python.saved_model import saved_model
+from tensorflow.python.training import training as train
 
 
 class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/lite/python/create_custom_op.py b/tensorflow/contrib/lite/python/create_custom_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..830f95358c455047db2cbad15cfed8c221e95dca
--- /dev/null
+++ b/tensorflow/contrib/lite/python/create_custom_op.py
@@ -0,0 +1,111 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Replaces a subgraph of a TensorFlow GraphDef with a single node.
+
+In conjunction with TOCO's --allow_custom_op this script allows selected
+portions of a TensorFlow GraphDef to be executed by custom code.
+
+Example:
+
+bazel run tensorflow/contrib/lite/python:create_custom_op  -- \
+  --input_graph=/tmp/input.pb \
+  --output_graph=/tmp/output.pb \
+  --inputs=concat,concat_1 \
+  --outputs=detection_classes \
+  --op_definition='op:"PostProcessing" attr{key:"num" value:{i:10}}'
+
+The above will identify a subgraph starting at nodes 'concat' and 'concat_1',
+and ending at 'detection_classes'. All nodes in between will be removed and
+replaced by a new op called 'PostProcessing'.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import uuid as _uuid
+from absl import app
+from absl import flags
+from google.protobuf import text_format
+from tensorflow.contrib.framework.python.framework.graph_util import fuse_op
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.python.platform import gfile
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("input_graph", "", "Binary graphdef to load.")
+flags.DEFINE_string("output_graph", "", "Resulting binary graphdef.")
+
+flags.DEFINE_string("inputs", "",
+                    "Comma-separated list of inputs to the subgraph.")
+flags.DEFINE_string("outputs", "",
+                    "Comma-separated list of outputs of the subgraph.")
+flags.DEFINE_string("op_definition", "",
+                    "A text NodeDef defining the contents of the custom op.")
+
+
+def _read_graph_def(filename):
+  if not gfile.Exists(filename):
+    raise ValueError("Input graph file '" + filename + "' does not exist!")
+
+  graph_def = graph_pb2.GraphDef()
+  with gfile.FastGFile(filename, "rb") as f:
+    graph_def.ParseFromString(f.read())
+  return graph_def
+
+
+def _write_graph_def(graph_def, filename):
+  if not filename:
+    raise ValueError("Output graph file not specified")
+
+  with gfile.Open(filename, "wb") as f:
+    f.write(graph_def.SerializeToString())
+
+
+def _collapse_subgraph(graph_def, inputs, outputs, op_definition):
+  """Substitute a custom op for the subgraph delimited by inputs and outputs."""
+  name = _uuid.uuid1().hex
+  # We need a default type, but it can be changed using 'op_definition'.
+  default_type = types_pb2.DT_FLOAT
+  new_graph = fuse_op(
+      graph_def=graph_def,
+      input_nodes=inputs,
+      output_nodes=outputs,
+      output_dtypes=[default_type for _ in outputs],
+      output_quantized=False,
+      op_name=name,
+      op_type="CustomTfLiteOp")
+  node_def = node_def_pb2.NodeDef()
+  text_format.Parse(op_definition, node_def)
+  for node in new_graph.node:
+    if node.name == name:
+      node.MergeFrom(node_def)
+  return new_graph
+
+
+def main(argv):
+  del argv  # unused
+  graph = _read_graph_def(filename=flags.FLAGS.input_graph)
+  graph = _collapse_subgraph(
+      graph_def=graph,
+      inputs=flags.FLAGS.inputs.split(","),
+      outputs=flags.FLAGS.outputs.split(","),
+      op_definition=flags.FLAGS.op_definition)
+  _write_graph_def(graph_def=graph, filename=flags.FLAGS.output_graph)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index b8638007f7e49737726d9939a00e8cb1d6a41281..cb9c0d3121895595ffce91e254bea3f527714809 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -121,8 +121,8 @@ class Interpreter(object):
     Raises:
       ValueError: If the interpreter could not resize the input tensor.
     """
-    if not self.ResizeInputTensor.SetTensor(input_index, tensor_size):
-      raise ValueError('Failed to set input')
+    if not self._interpreter.ResizeInputTensor(input_index, tensor_size):
+      raise ValueError('Failed to resize input')
 
   def get_output_details(self):
     """Gets model output details.
diff --git a/tensorflow/contrib/lite/python/interpreter_test.py b/tensorflow/contrib/lite/python/interpreter_test.py
index cd2386f5263f24e1e034015ec6880e71f0608c7c..f802edf020db8a9d4e7bb890aadaae7e34e983a8 100644
--- a/tensorflow/contrib/lite/python/interpreter_test.py
+++ b/tensorflow/contrib/lite/python/interpreter_test.py
@@ -81,6 +81,9 @@ class InterpreterTest(test_util.TensorFlowTestCase):
 
     test_input = np.array([[1, 2, 3, 4]], dtype=np.uint8)
     expected_output = np.array([[4, 3, 2, 1]], dtype=np.uint8)
+    interpreter.resize_tensor_input(input_details[0]['index'],
+                                    np.array(test_input.shape, dtype=np.int32))
+    interpreter.allocate_tensors()
     interpreter.set_tensor(input_details[0]['index'], test_input)
     interpreter.invoke()
 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 35ad226b78c906f0819afd5b029a1a0d438d69af..04fc098129854e168d68de3b308eabbcaa968ea8 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -72,6 +72,8 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
       return NPY_INT64;
     case kTfLiteString:
       return NPY_OBJECT;
+    case kTfLiteBool:
+      return NPY_BOOL;
     case kTfLiteNoType:
       return -1;
   }
@@ -90,6 +92,8 @@ TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
       return kTfLiteUInt8;
     case NPY_INT64:
       return kTfLiteInt64;
+    case NPY_BOOL:
+      return kTfLiteBool;
     case NPY_OBJECT:
     case NPY_STRING:
     case NPY_UNICODE:
@@ -186,7 +190,7 @@ bool InterpreterWrapper::ResizeInputTensor(int i, PyObject* value) {
   std::vector<int> dims(PyArray_SHAPE(array)[0]);
   memcpy(dims.data(), PyArray_BYTES(array), dims.size() * sizeof(int));
 
-  return interpreter_->ResizeInputTensor(i, dims);
+  return (interpreter_->ResizeInputTensor(i, dims) == kTfLiteOk);
 }
 
 std::string InterpreterWrapper::TensorName(int i) const {
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index ed6dd036f9fd9f39b74e902498d815793943924b..cf50f9d4d65cb7a36af8f82e2d29babbc9884d23 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -145,7 +145,8 @@ def toco_convert(input_data,
                  input_format=TENSORFLOW_GRAPHDEF,
                  output_format=TFLITE,
                  quantized_input_stats=None,
-                 drop_control_dependency=True):
+                 drop_control_dependency=True,
+                 allow_custom_ops=None):
   """Convert a model using TOCO from `input_format` to `output_format`.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
@@ -178,9 +179,12 @@ def toco_convert(input_data,
   toco = _toco_flags_pb2.TocoFlags()
   toco.input_format = input_format
   toco.output_format = output_format
+  toco.inference_type = inference_type
   toco.drop_control_dependency = drop_control_dependency
+  if allow_custom_ops is not None:
+    toco.allow_custom_ops = allow_custom_ops
+
   model = _model_flags_pb2.ModelFlags()
-  toco.inference_type = inference_type
   for idx, input_tensor in enumerate(input_tensors):
     if input_tensor.dtype == _dtypes.float32:
       tflite_input_type = FLOAT
diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
index 246ec85fe47e496e157a91ab4ff84f6b1eeab4a4..9717a4a1a496b888348514584888e62c4e3703b4 100644
--- a/tensorflow/contrib/lite/schema/BUILD
+++ b/tensorflow/contrib/lite/schema/BUILD
@@ -63,6 +63,9 @@ cc_test(
         "schema.fbs",
         "schema_v3.fbs",
     ],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         "//tensorflow/core:lib_platform",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index c63bfb28cc66494c3a42250632485c67861c5e9b..2b62c257d8410f9af1b250c9d108eba6737a9efe 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -33,6 +33,7 @@ enum TensorType : byte {
   UINT8 = 3,
   INT64 = 4,
   STRING = 5,
+  BOOL = 6,
 }
 
 // Parameters for converting a quantized tensor back to float. Given a
@@ -132,6 +133,9 @@ enum BuiltinOperator : byte {
   CAST = 53,
   PRELU = 54,
   MAXIMUM = 55,
+  ARG_MAX = 56,
+  MINIMUM = 57,
+  LESS = 58,
 }
 
 // Options for the builtin operators.
@@ -174,7 +178,9 @@ union BuiltinOptions {
   LogSoftmaxOptions,
   CastOptions,
   DequantizeOptions,
-  MaximumOptions,
+  MaximumMinimumOptions,
+  ArgMaxOptions,
+  LessOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -193,6 +199,8 @@ table Conv2DOptions {
   stride_w:int;
   stride_h:int;
   fused_activation_function:ActivationFunctionType;
+  dilation_w_factor:int = 1;
+  dilation_h_factor:int = 1;
 }
 
 table Pool2DOptions {
@@ -388,7 +396,14 @@ table CastOptions {
 table DequantizeOptions {
 }
 
-table MaximumOptions {
+table MaximumMinimumOptions {
+}
+
+table ArgMaxOptions {
+  output_type : TensorType;
+}
+
+table LessOptions {
 }
 
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 0735be5c8f1b1c8a87c3d47839ce54595d58af7d..0b9961d606d6095b0dc693df631e4efffcb0e35e 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -145,8 +145,14 @@ struct CastOptionsT;
 struct DequantizeOptions;
 struct DequantizeOptionsT;
 
-struct MaximumOptions;
-struct MaximumOptionsT;
+struct MaximumMinimumOptions;
+struct MaximumMinimumOptionsT;
+
+struct ArgMaxOptions;
+struct ArgMaxOptionsT;
+
+struct LessOptions;
+struct LessOptionsT;
 
 struct OperatorCode;
 struct OperatorCodeT;
@@ -170,18 +176,20 @@ enum TensorType {
   TensorType_UINT8 = 3,
   TensorType_INT64 = 4,
   TensorType_STRING = 5,
+  TensorType_BOOL = 6,
   TensorType_MIN = TensorType_FLOAT32,
-  TensorType_MAX = TensorType_STRING
+  TensorType_MAX = TensorType_BOOL
 };
 
-inline TensorType (&EnumValuesTensorType())[6] {
+inline TensorType (&EnumValuesTensorType())[7] {
   static TensorType values[] = {
     TensorType_FLOAT32,
     TensorType_FLOAT16,
     TensorType_INT32,
     TensorType_UINT8,
     TensorType_INT64,
-    TensorType_STRING
+    TensorType_STRING,
+    TensorType_BOOL
   };
   return values;
 }
@@ -194,6 +202,7 @@ inline const char **EnumNamesTensorType() {
     "UINT8",
     "INT64",
     "STRING",
+    "BOOL",
     nullptr
   };
   return names;
@@ -259,11 +268,14 @@ enum BuiltinOperator {
   BuiltinOperator_CAST = 53,
   BuiltinOperator_PRELU = 54,
   BuiltinOperator_MAXIMUM = 55,
+  BuiltinOperator_ARG_MAX = 56,
+  BuiltinOperator_MINIMUM = 57,
+  BuiltinOperator_LESS = 58,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_MAXIMUM
+  BuiltinOperator_MAX = BuiltinOperator_LESS
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[54] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[57] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -318,7 +330,10 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[54] {
     BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
     BuiltinOperator_CAST,
     BuiltinOperator_PRELU,
-    BuiltinOperator_MAXIMUM
+    BuiltinOperator_MAXIMUM,
+    BuiltinOperator_ARG_MAX,
+    BuiltinOperator_MINIMUM,
+    BuiltinOperator_LESS
   };
   return values;
 }
@@ -381,6 +396,9 @@ inline const char **EnumNamesBuiltinOperator() {
     "CAST",
     "PRELU",
     "MAXIMUM",
+    "ARG_MAX",
+    "MINIMUM",
+    "LESS",
     nullptr
   };
   return names;
@@ -431,12 +449,14 @@ enum BuiltinOptions {
   BuiltinOptions_LogSoftmaxOptions = 36,
   BuiltinOptions_CastOptions = 37,
   BuiltinOptions_DequantizeOptions = 38,
-  BuiltinOptions_MaximumOptions = 39,
+  BuiltinOptions_MaximumMinimumOptions = 39,
+  BuiltinOptions_ArgMaxOptions = 40,
+  BuiltinOptions_LessOptions = 41,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_MaximumOptions
+  BuiltinOptions_MAX = BuiltinOptions_LessOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[40] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[42] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -477,7 +497,9 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[40] {
     BuiltinOptions_LogSoftmaxOptions,
     BuiltinOptions_CastOptions,
     BuiltinOptions_DequantizeOptions,
-    BuiltinOptions_MaximumOptions
+    BuiltinOptions_MaximumMinimumOptions,
+    BuiltinOptions_ArgMaxOptions,
+    BuiltinOptions_LessOptions
   };
   return values;
 }
@@ -523,7 +545,9 @@ inline const char **EnumNamesBuiltinOptions() {
     "LogSoftmaxOptions",
     "CastOptions",
     "DequantizeOptions",
-    "MaximumOptions",
+    "MaximumMinimumOptions",
+    "ArgMaxOptions",
+    "LessOptions",
     nullptr
   };
   return names;
@@ -690,8 +714,16 @@ template<> struct BuiltinOptionsTraits<DequantizeOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_DequantizeOptions;
 };
 
-template<> struct BuiltinOptionsTraits<MaximumOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_MaximumOptions;
+template<> struct BuiltinOptionsTraits<MaximumMinimumOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MaximumMinimumOptions;
+};
+
+template<> struct BuiltinOptionsTraits<ArgMaxOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ArgMaxOptions;
+};
+
+template<> struct BuiltinOptionsTraits<LessOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LessOptions;
 };
 
 struct BuiltinOptionsUnion {
@@ -1029,13 +1061,29 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_DequantizeOptions ?
       reinterpret_cast<const DequantizeOptionsT *>(value) : nullptr;
   }
-  MaximumOptionsT *AsMaximumOptions() {
-    return type == BuiltinOptions_MaximumOptions ?
-      reinterpret_cast<MaximumOptionsT *>(value) : nullptr;
+  MaximumMinimumOptionsT *AsMaximumMinimumOptions() {
+    return type == BuiltinOptions_MaximumMinimumOptions ?
+      reinterpret_cast<MaximumMinimumOptionsT *>(value) : nullptr;
+  }
+  const MaximumMinimumOptionsT *AsMaximumMinimumOptions() const {
+    return type == BuiltinOptions_MaximumMinimumOptions ?
+      reinterpret_cast<const MaximumMinimumOptionsT *>(value) : nullptr;
+  }
+  ArgMaxOptionsT *AsArgMaxOptions() {
+    return type == BuiltinOptions_ArgMaxOptions ?
+      reinterpret_cast<ArgMaxOptionsT *>(value) : nullptr;
   }
-  const MaximumOptionsT *AsMaximumOptions() const {
-    return type == BuiltinOptions_MaximumOptions ?
-      reinterpret_cast<const MaximumOptionsT *>(value) : nullptr;
+  const ArgMaxOptionsT *AsArgMaxOptions() const {
+    return type == BuiltinOptions_ArgMaxOptions ?
+      reinterpret_cast<const ArgMaxOptionsT *>(value) : nullptr;
+  }
+  LessOptionsT *AsLessOptions() {
+    return type == BuiltinOptions_LessOptions ?
+      reinterpret_cast<LessOptionsT *>(value) : nullptr;
+  }
+  const LessOptionsT *AsLessOptions() const {
+    return type == BuiltinOptions_LessOptions ?
+      reinterpret_cast<const LessOptionsT *>(value) : nullptr;
   }
 };
 
@@ -1430,11 +1478,15 @@ struct Conv2DOptionsT : public flatbuffers::NativeTable {
   int32_t stride_w;
   int32_t stride_h;
   ActivationFunctionType fused_activation_function;
+  int32_t dilation_w_factor;
+  int32_t dilation_h_factor;
   Conv2DOptionsT()
       : padding(Padding_SAME),
         stride_w(0),
         stride_h(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
+        fused_activation_function(ActivationFunctionType_NONE),
+        dilation_w_factor(0),
+        dilation_h_factor(0) {
   }
 };
 
@@ -1444,7 +1496,9 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_PADDING = 4,
     VT_STRIDE_W = 6,
     VT_STRIDE_H = 8,
-    VT_FUSED_ACTIVATION_FUNCTION = 10
+    VT_FUSED_ACTIVATION_FUNCTION = 10,
+    VT_DILATION_W_FACTOR = 12,
+    VT_DILATION_H_FACTOR = 14
   };
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
@@ -1458,12 +1512,20 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   ActivationFunctionType fused_activation_function() const {
     return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
+  int32_t dilation_w_factor() const {
+    return GetField<int32_t>(VT_DILATION_W_FACTOR, 0);
+  }
+  int32_t dilation_h_factor() const {
+    return GetField<int32_t>(VT_DILATION_H_FACTOR, 0);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_PADDING) &&
            VerifyField<int32_t>(verifier, VT_STRIDE_W) &&
            VerifyField<int32_t>(verifier, VT_STRIDE_H) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           VerifyField<int32_t>(verifier, VT_DILATION_W_FACTOR) &&
+           VerifyField<int32_t>(verifier, VT_DILATION_H_FACTOR) &&
            verifier.EndTable();
   }
   Conv2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -1486,6 +1548,12 @@ struct Conv2DOptionsBuilder {
   void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
     fbb_.AddElement<int8_t>(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
   }
+  void add_dilation_w_factor(int32_t dilation_w_factor) {
+    fbb_.AddElement<int32_t>(Conv2DOptions::VT_DILATION_W_FACTOR, dilation_w_factor, 0);
+  }
+  void add_dilation_h_factor(int32_t dilation_h_factor) {
+    fbb_.AddElement<int32_t>(Conv2DOptions::VT_DILATION_H_FACTOR, dilation_h_factor, 0);
+  }
   explicit Conv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -1503,8 +1571,12 @@ inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
     Padding padding = Padding_SAME,
     int32_t stride_w = 0,
     int32_t stride_h = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
+    int32_t dilation_w_factor = 0,
+    int32_t dilation_h_factor = 0) {
   Conv2DOptionsBuilder builder_(_fbb);
+  builder_.add_dilation_h_factor(dilation_h_factor);
+  builder_.add_dilation_w_factor(dilation_w_factor);
   builder_.add_stride_h(stride_h);
   builder_.add_stride_w(stride_w);
   builder_.add_fused_activation_function(fused_activation_function);
@@ -3806,45 +3878,139 @@ inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(
 
 flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
-struct MaximumOptionsT : public flatbuffers::NativeTable {
-  typedef MaximumOptions TableType;
-  MaximumOptionsT() {
+struct MaximumMinimumOptionsT : public flatbuffers::NativeTable {
+  typedef MaximumMinimumOptions TableType;
+  MaximumMinimumOptionsT() {
   }
 };
 
-struct MaximumOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef MaximumOptionsT NativeTableType;
+struct MaximumMinimumOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MaximumMinimumOptionsT NativeTableType;
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            verifier.EndTable();
   }
-  MaximumOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(MaximumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<MaximumOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  MaximumMinimumOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(MaximumMinimumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MaximumMinimumOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
-struct MaximumOptionsBuilder {
+struct MaximumMinimumOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  explicit MaximumOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+  explicit MaximumMinimumOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  MaximumOptionsBuilder &operator=(const MaximumOptionsBuilder &);
-  flatbuffers::Offset<MaximumOptions> Finish() {
+  MaximumMinimumOptionsBuilder &operator=(const MaximumMinimumOptionsBuilder &);
+  flatbuffers::Offset<MaximumMinimumOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<MaximumOptions>(end);
+    auto o = flatbuffers::Offset<MaximumMinimumOptions>(end);
     return o;
   }
 };
 
-inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(
+inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(
     flatbuffers::FlatBufferBuilder &_fbb) {
-  MaximumOptionsBuilder builder_(_fbb);
+  MaximumMinimumOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ArgMaxOptionsT : public flatbuffers::NativeTable {
+  typedef ArgMaxOptions TableType;
+  TensorType output_type;
+  ArgMaxOptionsT()
+      : output_type(TensorType_FLOAT32) {
+  }
+};
+
+struct ArgMaxOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ArgMaxOptionsT NativeTableType;
+  enum {
+    VT_OUTPUT_TYPE = 4
+  };
+  TensorType output_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_OUTPUT_TYPE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_OUTPUT_TYPE) &&
+           verifier.EndTable();
+  }
+  ArgMaxOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ArgMaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ArgMaxOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ArgMaxOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_output_type(TensorType output_type) {
+    fbb_.AddElement<int8_t>(ArgMaxOptions::VT_OUTPUT_TYPE, static_cast<int8_t>(output_type), 0);
+  }
+  explicit ArgMaxOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ArgMaxOptionsBuilder &operator=(const ArgMaxOptionsBuilder &);
+  flatbuffers::Offset<ArgMaxOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ArgMaxOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType output_type = TensorType_FLOAT32) {
+  ArgMaxOptionsBuilder builder_(_fbb);
+  builder_.add_output_type(output_type);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct LessOptionsT : public flatbuffers::NativeTable {
+  typedef LessOptions TableType;
+  LessOptionsT() {
+  }
+};
+
+struct LessOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LessOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  LessOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LessOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LessOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LessOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit LessOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LessOptionsBuilder &operator=(const LessOptionsBuilder &);
+  flatbuffers::Offset<LessOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LessOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LessOptions> CreateLessOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  LessOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LessOptions> CreateLessOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
@@ -4077,8 +4243,14 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const DequantizeOptions *builtin_options_as_DequantizeOptions() const {
     return builtin_options_type() == BuiltinOptions_DequantizeOptions ? static_cast<const DequantizeOptions *>(builtin_options()) : nullptr;
   }
-  const MaximumOptions *builtin_options_as_MaximumOptions() const {
-    return builtin_options_type() == BuiltinOptions_MaximumOptions ? static_cast<const MaximumOptions *>(builtin_options()) : nullptr;
+  const MaximumMinimumOptions *builtin_options_as_MaximumMinimumOptions() const {
+    return builtin_options_type() == BuiltinOptions_MaximumMinimumOptions ? static_cast<const MaximumMinimumOptions *>(builtin_options()) : nullptr;
+  }
+  const ArgMaxOptions *builtin_options_as_ArgMaxOptions() const {
+    return builtin_options_type() == BuiltinOptions_ArgMaxOptions ? static_cast<const ArgMaxOptions *>(builtin_options()) : nullptr;
+  }
+  const LessOptions *builtin_options_as_LessOptions() const {
+    return builtin_options_type() == BuiltinOptions_LessOptions ? static_cast<const LessOptions *>(builtin_options()) : nullptr;
   }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
@@ -4258,8 +4430,16 @@ template<> inline const DequantizeOptions *Operator::builtin_options_as<Dequanti
   return builtin_options_as_DequantizeOptions();
 }
 
-template<> inline const MaximumOptions *Operator::builtin_options_as<MaximumOptions>() const {
-  return builtin_options_as_MaximumOptions();
+template<> inline const MaximumMinimumOptions *Operator::builtin_options_as<MaximumMinimumOptions>() const {
+  return builtin_options_as_MaximumMinimumOptions();
+}
+
+template<> inline const ArgMaxOptions *Operator::builtin_options_as<ArgMaxOptions>() const {
+  return builtin_options_as_ArgMaxOptions();
+}
+
+template<> inline const LessOptions *Operator::builtin_options_as<LessOptions>() const {
+  return builtin_options_as_LessOptions();
 }
 
 struct OperatorBuilder {
@@ -4729,6 +4909,8 @@ inline void Conv2DOptions::UnPackTo(Conv2DOptionsT *_o, const flatbuffers::resol
   { auto _e = stride_w(); _o->stride_w = _e; };
   { auto _e = stride_h(); _o->stride_h = _e; };
   { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  { auto _e = dilation_w_factor(); _o->dilation_w_factor = _e; };
+  { auto _e = dilation_h_factor(); _o->dilation_h_factor = _e; };
 }
 
 inline flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -4743,12 +4925,16 @@ inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(flatbuffers::FlatB
   auto _stride_w = _o->stride_w;
   auto _stride_h = _o->stride_h;
   auto _fused_activation_function = _o->fused_activation_function;
+  auto _dilation_w_factor = _o->dilation_w_factor;
+  auto _dilation_h_factor = _o->dilation_h_factor;
   return tflite::CreateConv2DOptions(
       _fbb,
       _padding,
       _stride_w,
       _stride_h,
-      _fused_activation_function);
+      _fused_activation_function,
+      _dilation_w_factor,
+      _dilation_h_factor);
 }
 
 inline Pool2DOptionsT *Pool2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -5796,26 +5982,75 @@ inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffer
       _fbb);
 }
 
-inline MaximumOptionsT *MaximumOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new MaximumOptionsT();
+inline MaximumMinimumOptionsT *MaximumMinimumOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MaximumMinimumOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void MaximumOptions::UnPackTo(MaximumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void MaximumMinimumOptions::UnPackTo(MaximumMinimumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
 }
 
-inline flatbuffers::Offset<MaximumOptions> MaximumOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateMaximumOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<MaximumMinimumOptions> MaximumMinimumOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMaximumMinimumOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MaximumOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  return tflite::CreateMaximumOptions(
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MaximumMinimumOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateMaximumMinimumOptions(
+      _fbb);
+}
+
+inline ArgMaxOptionsT *ArgMaxOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ArgMaxOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ArgMaxOptions::UnPackTo(ArgMaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = output_type(); _o->output_type = _e; };
+}
+
+inline flatbuffers::Offset<ArgMaxOptions> ArgMaxOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateArgMaxOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ArgMaxOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _output_type = _o->output_type;
+  return tflite::CreateArgMaxOptions(
+      _fbb,
+      _output_type);
+}
+
+inline LessOptionsT *LessOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LessOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LessOptions::UnPackTo(LessOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<LessOptions> LessOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLessOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LessOptions> CreateLessOptions(flatbuffers::FlatBufferBuilder &_fbb, const LessOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LessOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateLessOptions(
       _fbb);
 }
 
@@ -6151,8 +6386,16 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
     default: return false;
@@ -6325,8 +6568,16 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
       return ptr->UnPack(resolver);
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptions *>(obj);
       return ptr->UnPack(resolver);
     }
     default: return nullptr;
@@ -6487,9 +6738,17 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const DequantizeOptionsT *>(value);
       return CreateDequantizeOptions(_fbb, ptr, _rehasher).Union();
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptionsT *>(value);
-      return CreateMaximumOptions(_fbb, ptr, _rehasher).Union();
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptionsT *>(value);
+      return CreateMaximumMinimumOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptionsT *>(value);
+      return CreateArgMaxOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<const LessOptionsT *>(value);
+      return CreateLessOptions(_fbb, ptr, _rehasher).Union();
     }
     default: return 0;
   }
@@ -6649,8 +6908,16 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new DequantizeOptionsT(*reinterpret_cast<DequantizeOptionsT *>(u.value));
       break;
     }
-    case BuiltinOptions_MaximumOptions: {
-      value = new MaximumOptionsT(*reinterpret_cast<MaximumOptionsT *>(u.value));
+    case BuiltinOptions_MaximumMinimumOptions: {
+      value = new MaximumMinimumOptionsT(*reinterpret_cast<MaximumMinimumOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      value = new ArgMaxOptionsT(*reinterpret_cast<ArgMaxOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_LessOptions: {
+      value = new LessOptionsT(*reinterpret_cast<LessOptionsT *>(u.value));
       break;
     }
     default:
@@ -6850,8 +7117,18 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<MaximumOptionsT *>(value);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<MaximumMinimumOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<ArgMaxOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_LessOptions: {
+      auto ptr = reinterpret_cast<LessOptionsT *>(value);
       delete ptr;
       break;
     }
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 62f20638bac943e9f674087e46c18233e8b09d63..bd888a415b035917e64932103740890876eb493e 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -18,6 +18,7 @@ gen_zipped_test_files(
     name = "optest",
     files = [
         "add.zip",
+        "arg_max.zip",
         "avg_pool.zip",
         "batch_to_space_nd.zip",
         "concat.zip",
@@ -33,14 +34,15 @@ gen_zipped_test_files(
         "global_batch_norm.zip",
         "l2_pool.zip",
         "l2norm.zip",
+        "less.zip",
         "local_response_norm.zip",
         "log_softmax.zip",
         "max_pool.zip",
         "maximum.zip",
         "mean.zip",
+        "minimum.zip",
         "mul.zip",
         "pad.zip",
-        "prelu.zip",
         "relu.zip",
         "relu1.zip",
         "relu6.zip",
@@ -160,6 +162,9 @@ cc_test(
     size = "small",
     srcs = ["tflite_driver_test.cc"],
     data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         ":tflite_driver",
         "@com_google_googletest//:gtest_main",
@@ -194,7 +199,6 @@ cc_library(
 
 cc_library(
     name = "util",
-    testonly = 1,
     hdrs = ["util.h"],
 )
 
@@ -249,7 +253,6 @@ cc_test(
 
 cc_library(
     name = "generate_testspec",
-    testonly = 1,
     srcs = ["generate_testspec.cc"],
     hdrs = ["generate_testspec.h"],
     deps = [
@@ -275,7 +278,6 @@ cc_test(
 
 cc_library(
     name = "tflite_diff_util",
-    testonly = 1,
     srcs = ["tflite_diff_util.cc"],
     hdrs = ["tflite_diff_util.h"],
     deps = [
@@ -293,7 +295,6 @@ cc_library(
 
 cc_library(
     name = "tflite_diff_flags",
-    testonly = 1,
     hdrs = ["tflite_diff_flags.h"],
     deps = [
         ":split",
@@ -336,6 +337,15 @@ tf_cc_test(
     ],
 )
 
+cc_binary(
+    name = "tflite_diff",
+    srcs = ["tflite_diff_example_test.cc"],
+    deps = [
+        ":tflite_diff_flags",
+        ":tflite_diff_util",
+    ],
+)
+
 tf_cc_test(
     name = "generated_examples_zip_test",
     size = "large",
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 80450524520892bb6f9d41f0d2c79f355ca3af15..e045c27427f1cc40c824c74ca19f0426075c650d 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -17,10 +17,9 @@
 
 Usage:
 
-generate_examples <output directory> zipped
+generate_examples <output directory>
 
 bazel run //tensorflow/contrib/lite/testing:generate_examples
-    third_party/tensorflow/contrib/lite/testing/generated_examples zipped
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -52,8 +51,6 @@ from tensorflow.python.ops import rnn
 parser = argparse.ArgumentParser(description="Script to generate TFLite tests.")
 parser.add_argument("output_path",
                     help="Directory where the outputs will be go.")
-# TODO(ahentz): remove this flag
-parser.add_argument("type", help="zipped")
 parser.add_argument("--zip_to_output",
                     type=str,
                     help="Particular zip to output.",
@@ -96,9 +93,6 @@ KNOWN_BUGS = {
     r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
     # SpaceToDepth only supports float32.
     r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
-    # BatchToSpaceND doesn't support cropping. This catches test cases with
-    # const tensors as crops.
-    r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634",
     # BatchToSpaceND only supports 4D tensors.
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
     # Div will use floordiv.
@@ -107,6 +101,10 @@ KNOWN_BUGS = {
     r"strided_slice.*begin=\[0\].*end=\[1\].*": "73170889",
     # No support for SplitV
     r"split.*num_or_size_splits=\[2,2\]": "73377559",
+    # Needs support for dimensions other than the last one in argmax.
+    r"arg_max.*axis=0.*": "77546240",
+    r"arg_max.*axis=1.*": "77546240",
+    r"arg_max.*axis=2.*": "77546240",
 }
 
 
@@ -543,6 +541,18 @@ def make_pool_tests(pool_op_in):
   return f
 
 
+def make_l2_pool_tests(zip_path):
+  make_pool_tests(make_l2_pool)(zip_path)
+
+
+def make_avg_pool_tests(zip_path):
+  make_pool_tests(tf.nn.avg_pool)(zip_path)
+
+
+def make_max_pool_tests(zip_path):
+  make_pool_tests(tf.nn.max_pool)(zip_path)
+
+
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
 
@@ -617,54 +627,6 @@ def make_relu6_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_prelu_tests(zip_path):
-  """Make a set of tests to do PReLU."""
-
-  test_parameters = [{
-      # The canonical case for image processing is having a 4D `input` (NHWC)
-      # and `shared_axes`=[1, 2], so the alpha parameter is per channel.
-      "input_shape": [[1, 10, 10, 3], [3, 3, 3, 3]],
-      "shared_axes": [[1, 2], [1]],
-  }]
-
-  def build_graph(parameters):
-    """Build the graph for the test case."""
-
-    input_tensor = tf.placeholder(
-        dtype=tf.float32, name="input", shape=parameters["input_shape"])
-    prelu = tf.keras.layers.PReLU(shared_axes=parameters["shared_axes"])
-    out = prelu(input_tensor)
-    return [input_tensor], [out]
-
-  def build_inputs(parameters, sess, inputs, outputs):
-    """Build the inputs for the test case."""
-
-    input_shape = parameters["input_shape"]
-    input_values = create_tensor_data(
-        np.float32, input_shape, min_value=-10, max_value=10)
-    shared_axes = parameters["shared_axes"]
-
-    alpha_shape = []
-    for dim in range(1, len(input_shape)):
-      alpha_shape.append(1 if dim in shared_axes else input_shape[dim])
-
-    alpha_values = create_tensor_data(np.float32, alpha_shape)
-
-    with tf.variable_scope("", reuse=True):
-      alpha = tf.get_variable("p_re_lu/alpha")
-      sess.run(alpha.assign(alpha_values))
-
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
-
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      use_frozen_graph=True)
-
-
 # This function tests various TensorFLow functions that generates Const op,
 # including `tf.ones`, `tf.zeros` and random functions.
 def make_constant_tests(zip_path):
@@ -897,11 +859,62 @@ def make_maximum_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_minimum_tests(zip_path):
+  """Make a set of tests to do minimum."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the minimum op testing graph."""
+    input_tensor_1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_1",
+        shape=parameters["input_shape_1"])
+    input_tensor_2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_2",
+        shape=parameters["input_shape_2"])
+
+    out = tf.minimum(input_tensor_1, input_tensor_2)
+    return [input_tensor_1, input_tensor_2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_1"]),
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_2"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_binary_op_tests_func(binary_operator):
   """Return a function that does a test on a binary operator."""
   return lambda zip_path: make_binary_op_tests(zip_path, binary_operator)
 
 
+def make_add_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.add)
+
+
+def make_div_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.div)
+
+
+def make_sub_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.subtract)
+
+
+def make_mul_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.multiply)
+
+
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
@@ -1026,6 +1039,7 @@ def make_conv_tests(zip_path):
           "input_shape": [[1, 3, 4, 3]],
           "filter_shape": [[1, 1, 3, 2]],
           "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "dilations": [[1, 1, 1, 1], [1, 3, 2, 1], [1, 2, 2, 1]],
           "padding": ["SAME", "VALID"],
           "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
           "constant_filter": [True, False],
@@ -1034,6 +1048,7 @@ def make_conv_tests(zip_path):
           "input_shape": [[2, 14, 14, 2]],
           "filter_shape": [[6, 6, 2, 2]],
           "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "dilations": [[1, 1, 1, 1], [1, 2, 2, 1]],
           "padding": ["SAME", "VALID"],
           "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
           "constant_filter": [True, False],
@@ -1059,6 +1074,7 @@ def make_conv_tests(zip_path):
         input_tensor,
         filter_input,
         strides=parameters["strides"],
+        dilations=parameters["dilations"],
         padding=parameters["padding"],
         data_format=parameters["data_format"])
     return input_tensors, [out]
@@ -1169,7 +1185,7 @@ def make_split_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_concatenation_tests(zip_path):
+def make_concat_tests(zip_path):
   """Make a set of tests to do concatenation."""
 
   test_parameters = [{
@@ -1579,7 +1595,7 @@ def make_batch_to_space_nd_tests(zip_path):
   test_parameters = [
       {
           "dtype": [tf.float32, tf.int64, tf.int32],
-          "input_shape": [[12, 2, 2, 1]],
+          "input_shape": [[12, 3, 3, 1]],
           "block_shape": [[1, 4], [2, 2], [3, 4]],
           "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]],
           "constant_block_shape": [True, False],
@@ -1929,7 +1945,7 @@ def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
 
 
 def make_topk_tests(zip_path):
-  """Make a set of tests to do gather."""
+  """Make a set of tests to do topk."""
 
   test_parameters = [{
       "input_dtype": [tf.float32, tf.int32],
@@ -1937,7 +1953,7 @@ def make_topk_tests(zip_path):
   }]
 
   def build_graph(parameters):
-    """Build the gather op testing graph."""
+    """Build the topk op testing graph."""
     input_value = tf.placeholder(
         dtype=parameters["input_dtype"],
         name="input",
@@ -1954,6 +1970,69 @@ def make_topk_tests(zip_path):
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
+
+def make_arg_max_tests(zip_path):
+  """Make a set of tests to do arg_max."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape": [[1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
+      "axis": [0, 1, 2, 3],
+      "output_type": [tf.int32, tf.int64],
+  }]
+
+  def build_graph(parameters):
+    """Build the topk op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    axis = tf.constant(parameters["axis"], name="axis")
+    out = tf.arg_max(input_value, axis, output_type=parameters["output_type"])
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_less_tests(zip_path):
+  """Make a set of tests to do less."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+                           ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+                           ([5, 5], [1]), ([10], [2, 4, 10])],
+  }]
+
+  def build_graph(parameters):
+    """Build the less op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape_pair"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_pair"][1])
+    out = tf.less(input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
@@ -1966,69 +2045,26 @@ def main(unused_args):
       if not os.path.isdir(x):
         raise RuntimeError("Failed to create dir %r" % x)
 
-  if FLAGS.type == "zipped":
-    opstest_path = os.path.join(FLAGS.output_path)
-    mkdir_if_not_exist(opstest_path)
-    def _path(filename):
-      return os.path.join(opstest_path, filename)
-
-    dispatch = {
-        "control_dep.zip": make_control_dep_tests,
-        "add.zip": make_binary_op_tests_func(tf.add),
-        "space_to_batch_nd.zip": make_space_to_batch_nd_tests,
-        "div.zip": make_binary_op_tests_func(tf.div),
-        "sub.zip": make_binary_op_tests_func(tf.subtract),
-        "batch_to_space_nd.zip": make_batch_to_space_nd_tests,
-        "conv.zip": make_conv_tests,
-        "constant.zip": make_constant_tests,
-        "depthwiseconv.zip": make_depthwiseconv_tests,
-        "concat.zip": make_concatenation_tests,
-        "fully_connected.zip": make_fully_connected_tests,
-        "global_batch_norm.zip": make_global_batch_norm_tests,
-        "gather.zip": make_gather_tests,
-        "fused_batch_norm.zip": make_fused_batch_norm_tests,
-        "l2norm.zip": make_l2norm_tests,
-        "local_response_norm.zip": make_local_response_norm_tests,
-        "mul.zip": make_binary_op_tests_func(tf.multiply),
-        "relu.zip": make_relu_tests,
-        "relu1.zip": make_relu1_tests,
-        "relu6.zip": make_relu6_tests,
-        "prelu.zip": make_prelu_tests,
-        "l2_pool.zip": make_pool_tests(make_l2_pool),
-        "avg_pool.zip": make_pool_tests(tf.nn.avg_pool),
-        "max_pool.zip": make_pool_tests(tf.nn.max_pool),
-        "pad.zip": make_pad_tests,
-        "reshape.zip": make_reshape_tests,
-        "resize_bilinear.zip": make_resize_bilinear_tests,
-        "sigmoid.zip": make_sigmoid_tests,
-        "softmax.zip": make_softmax_tests,
-        "space_to_depth.zip": make_space_to_depth_tests,
-        "topk.zip": make_topk_tests,
-        "split.zip": make_split_tests,
-        "transpose.zip": make_transpose_tests,
-        "mean.zip": make_mean_tests,
-        "squeeze.zip": make_squeeze_tests,
-        "strided_slice.zip": make_strided_slice_tests,
-        "exp.zip": make_exp_tests,
-        "log_softmax.zip": make_log_softmax_tests,
-        "lstm.zip": make_lstm_tests,
-        "maximum.zip": make_maximum_tests,
-    }
-    out = FLAGS.zip_to_output
-    bin_path = FLAGS.toco
-    if out in dispatch:
-      dispatch[out](_path(out))
-    else:
-      raise RuntimeError("Invalid zip to output %r" % out)
+  opstest_path = os.path.join(FLAGS.output_path)
+  mkdir_if_not_exist(opstest_path)
 
-  else:
-    raise RuntimeError("Invalid argument for type of generation.")
+  out = FLAGS.zip_to_output
+  bin_path = FLAGS.toco
+  test_function = ("make_%s_tests" % out.replace(".zip", ""))
+  if test_function not in globals():
+    raise RuntimeError("Can't find a test function to create %r. Tried %r" %
+                       (out, test_function))
+
+  # TODO(ahentz): accessing globals() is not very elegant. We should either
+  # break this file into multiple tests or use decorator-based registration to
+  # avoid using globals().
+  globals()[test_function](os.path.join(opstest_path, out))
 
 
 if __name__ == "__main__":
   FLAGS, unparsed = parser.parse_known_args()
 
   if unparsed:
-    print("Usage: %s <path out> zipped <zip file to generate>")
+    print("Usage: %s <path out> <zip file to generate>")
   else:
     tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
index eb3deafb6986e877f0a553a8b6f712102af4caca..6580845af42b3cdded19b578b41c682089aaf9ef 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -22,7 +22,22 @@ limitations under the License.
 namespace tflite {
 namespace testing {
 
-void GenerateTestSpecFromTensorflowModel(
+template <typename T>
+void GenerateCsv(const std::vector<int>& shape, float min, float max,
+                 string* out) {
+  auto random_float = [](int min, int max) {
+    static unsigned int seed;
+    return min + (max - min) * static_cast<float>(rand_r(&seed)) / RAND_MAX;
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(shape, random_t);
+  *out = Join(data.data(), data.size(), ",");
+}
+
+bool GenerateTestSpecFromTensorflowModel(
     std::iostream& stream, const string& tensorflow_model_path,
     const string& tflite_model_path, const std::vector<string>& input_layer,
     const std::vector<string>& input_layer_type,
@@ -31,12 +46,6 @@ void GenerateTestSpecFromTensorflowModel(
   CHECK_EQ(input_layer.size(), input_layer_type.size());
   CHECK_EQ(input_layer.size(), input_layer_shape.size());
 
-  // Initialize random functions.
-  static unsigned int seed = 0;
-  std::function<float(int)> float_rand = [](int idx) {
-    return static_cast<float>(rand_r(&seed)) / RAND_MAX - 0.5f;
-  };
-
   // Generate inputs.
   std::vector<string> input_values;
   input_values.resize(input_layer.size());
@@ -46,15 +55,25 @@ void GenerateTestSpecFromTensorflowModel(
     auto shape = Split<int>(input_layer_shape[i], ",");
 
     switch (type) {
-      case tensorflow::DT_FLOAT: {
-        const auto& data = GenerateRandomTensor<float>(shape, float_rand);
-        input_values[i] = Join(data.data(), data.size(), ",");
+      case tensorflow::DT_FLOAT:
+        GenerateCsv<float>(shape, -0.5, 0.5, &input_values[i]);
+        break;
+      case tensorflow::DT_UINT8:
+        GenerateCsv<uint8_t>(shape, 0, 255, &input_values[i]);
+        break;
+      case tensorflow::DT_INT32:
+        GenerateCsv<int32_t>(shape, -100, 100, &input_values[i]);
+        break;
+      case tensorflow::DT_INT64:
+        GenerateCsv<int64_t>(shape, -100, 100, &input_values[i]);
+        break;
+      case tensorflow::DT_BOOL:
+        GenerateCsv<int>(shape, 0.01, 1.99, &input_values[i]);
         break;
-      }
       default:
-
-        fprintf(stderr, "Unsupported type %d when generating testspec\n", type);
-        return;
+        fprintf(stderr, "Unsupported type %d (%s) when generating testspec.\n",
+                type, input_layer_type[i].c_str());
+        return false;
     }
   }
 
@@ -82,6 +101,8 @@ void GenerateTestSpecFromTensorflowModel(
     stream << "  output: \"" << runner.ReadOutput(i) << "\"\n";
   }
   stream << "}\n";
+
+  return true;
 }
 
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.h b/tensorflow/contrib/lite/testing/generate_testspec.h
index 3529ee709b66625fff6e2a35b78e47f3778f0fe7..6e31a853c3f7f82a89126ff83af784ffd418741a 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.h
+++ b/tensorflow/contrib/lite/testing/generate_testspec.h
@@ -34,7 +34,7 @@ namespace testing {
 //   input_layer_type: datatypes of input tensors. Example: float
 //   input_layer_shape: shapes of input tensors, separated by comma. example:
 //   1,3,4 output_layer: names of output tensors. Example: output
-void GenerateTestSpecFromTensorflowModel(
+bool GenerateTestSpecFromTensorflowModel(
     std::iostream& stream, const string& tensorflow_model_path,
     const string& tflite_model_path, const std::vector<string>& input_layer,
     const std::vector<string>& input_layer_type,
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 6697b86e798756bf3273e36dc105eee17d146aa6..9da8bd7a28891fc4a534710fd0fd8344cde1b197 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -94,6 +94,11 @@ std::map<string, string> kBrokenTests = {
 
     // No support for axis!=0 in GatherV2.
     {R"(^\/gather.*axis=1)", "76910444"},
+
+    // No support for arbitrary dimensions in ArgMax.
+    {R"(^\/arg_max.*axis=0)", "77546240"},
+    {R"(^\/arg_max.*axis=1)", "77546240"},
+    {R"(^\/arg_max.*axis=2)", "77546240"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
@@ -236,6 +241,7 @@ TEST_P(OpsTest, RunStuff) {
       ::testing::ValuesIn(UnarchiveZipAndFindTestNames(#zip_base ".zip")));
 
 INSTANTIATE_TESTS(add)
+INSTANTIATE_TESTS(arg_max)
 INSTANTIATE_TESTS(avg_pool)
 INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
@@ -256,11 +262,12 @@ INSTANTIATE_TESTS(log_softmax)
 INSTANTIATE_TESTS(maximum)
 INSTANTIATE_TESTS(max_pool)
 INSTANTIATE_TESTS(mean)
+INSTANTIATE_TESTS(minimum)
 INSTANTIATE_TESTS(mul)
 INSTANTIATE_TESTS(pad)
 INSTANTIATE_TESTS(relu)
 INSTANTIATE_TESTS(relu1)
-INSTANTIATE_TESTS(prelu)
+// INSTANTIATE_TESTS(prelu)
 INSTANTIATE_TESTS(relu6)
 INSTANTIATE_TESTS(reshape)
 INSTANTIATE_TESTS(resize_bilinear)
@@ -273,6 +280,7 @@ INSTANTIATE_TESTS(squeeze)
 INSTANTIATE_TESTS(strided_slice)
 INSTANTIATE_TESTS(sub)
 INSTANTIATE_TESTS(transpose)
+INSTANTIATE_TESTS(less)
 
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/split.h b/tensorflow/contrib/lite/testing/split.h
index 428cfda4f216f0ee6409a32c43a4cf91ecc11922..896f2949efa6aeda76940bae18a11dccf3c2f01b 100644
--- a/tensorflow/contrib/lite/testing/split.h
+++ b/tensorflow/contrib/lite/testing/split.h
@@ -80,6 +80,16 @@ inline std::vector<uint8_t> Split(const string& s, const string& delimiter) {
   return fields;
 }
 
+template <>
+inline std::vector<bool> Split(const string& s, const string& delimiter) {
+  std::vector<bool> fields;
+  for (const auto& p : SplitToPos(s, delimiter)) {
+    fields.push_back(
+        static_cast<bool>(strtol(s.data() + p.first, nullptr, 10)));
+  }
+  return fields;
+}
+
 }  // namespace testing
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/testing/split_test.cc b/tensorflow/contrib/lite/testing/split_test.cc
index 3d1e25d9c7dab50984928adfe0d7392675578662..76b918cbcd83ef43c52057b84bcc2a8f4ff6b8f7 100644
--- a/tensorflow/contrib/lite/testing/split_test.cc
+++ b/tensorflow/contrib/lite/testing/split_test.cc
@@ -52,6 +52,11 @@ TEST(SplitTest, SplitUint8) {
   EXPECT_THAT(Split<uint8_t>("1,-1,258", ","), ElementsAre(1, 255, 2));
 }
 
+TEST(SplitTest, SplitBool) {
+  EXPECT_THAT(Split<bool>("1, 0, 0, 1", ","),
+              ElementsAre(true, false, false, true));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index 2c253bb1983e5ddc5bc12858c929585d1bcee710..7b295875aab12bf48da2341ce05dd53442464cf0 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -87,10 +87,9 @@ TfDriver::TfDriver(const std::vector<string>& input_layer,
 
 void TfDriver::LoadModel(const string& bin_file_path) {
   if (!IsValid()) return;
-  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
   std::ifstream model(bin_file_path);
   if (model.fail()) {
-    Invalidate("Failed to find the model");
+    Invalidate("Failed to find the model " + bin_file_path);
     return;
   }
 
@@ -121,6 +120,10 @@ void TfDriver::SetInput(int id, const string& csv_values) {
       FillTensorWithData<int32_t>(&tensor, csv_values);
       break;
     }
+    case tensorflow::DT_UINT8: {
+      FillTensorWithData<uint8_t>(&tensor, csv_values);
+      break;
+    }
     default:
       fprintf(stderr, "Unsupported type %d in SetInput\n", input_types_[id]);
       Invalidate("Unsupported tensor data type");
@@ -162,6 +165,8 @@ string TfDriver::ReadOutput(int id) {
       return TensorDataToCsvString<float>(output_tensors_[id]);
     case tensorflow::DT_INT32:
       return TensorDataToCsvString<int32_t>(output_tensors_[id]);
+    case tensorflow::DT_UINT8:
+      return TensorDataToCsvString<uint8_t>(output_tensors_[id]);
     default:
       fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
       Invalidate("Unsupported tensor data type");
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc b/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
index 3817e68111dbaaf2a38ceff9fbc38f30f303cb5f..5afa0f800cdaa8bf70a11cb6e2ac64ace8138e79 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
@@ -19,10 +19,13 @@ limitations under the License.
 int main(int argc, char** argv) {
   ::tflite::testing::DiffOptions options =
       ::tflite::testing::ParseTfliteDiffFlags(&argc, argv);
+  if (options.tensorflow_model.empty()) return 1;
+  int failure_count = 0;
   for (int i = 0; i < 100; i++) {
     if (!tflite::testing::RunDiffTest(options)) {
-      return 1;
+      ++failure_count;
     }
   }
-  return 0;
+  fprintf(stderr, "Num errors: %d\n", failure_count);
+  return failure_count != 0 ? 1 : 0;
 }
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
index 5f1129d501b7235f1202b704cf36904e07b8720e..706108ed73bb3fd9bd784cffffe322d6981433e6 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -51,9 +51,11 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
                        "output_1,output_2"),
   };
 
+  bool no_inputs = *argc == 1;
   bool success = tensorflow::Flags::Parse(argc, argv, flags);
-  if (!success || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
+  if (!success || no_inputs || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
     fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return {};
   }
 
   return {values.tensorflow_model,
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.cc b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
index 9ef4e1f66c7d31c746c18d63495e760585d4af9e..f601d3752ddb5df9f2b5ac73d9bc303efaade4a5 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
@@ -27,13 +27,13 @@ namespace testing {
 
 bool RunDiffTest(const DiffOptions& options) {
   std::stringstream tflite_stream;
-  GenerateTestSpecFromTensorflowModel(
-      tflite_stream, options.tensorflow_model, options.tflite_model,
-      options.input_layer, options.input_layer_type, options.input_layer_shape,
-      options.output_layer);
+  if (!GenerateTestSpecFromTensorflowModel(
+          tflite_stream, options.tensorflow_model, options.tflite_model,
+          options.input_layer, options.input_layer_type,
+          options.input_layer_shape, options.output_layer))
+    return false;
   TfLiteDriver tflite_driver(/*use_nnapi=*/true);
   tflite_driver.LoadModel(options.tflite_model);
-  std::cout << tflite_stream.str();
   return tflite::testing::ParseAndRunTests(&tflite_stream, &tflite_driver);
 }
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index c399f4f2b78d7420ac6ea7098ed44b2122216279..58fe5bd6e40b3d5979d64fae659eb39bfe87c265 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -42,6 +42,10 @@ template <>
 uint8_t Value(const TfLitePtrUnion& data, int index) {
   return data.uint8[index];
 }
+template <>
+bool Value(const TfLitePtrUnion& data, int index) {
+  return data.b[index];
+}
 
 template <typename T>
 void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
@@ -79,6 +83,8 @@ class TfLiteDriver::Expectation {
         return TypedCheck<int64_t>(verbose, tensor);
       case kTfLiteUInt8:
         return TypedCheck<uint8_t>(verbose, tensor);
+      case kTfLiteBool:
+        return TypedCheck<bool>(verbose, tensor);
       default:
         fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
         return false;
@@ -143,7 +149,6 @@ void TfLiteDriver::AllocateTensors() {
 
 void TfLiteDriver::LoadModel(const string& bin_file_path) {
   if (!IsValid()) return;
-  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
 
   model_ = FlatBufferModel::BuildFromFile(GetFullPath(bin_file_path).c_str());
   if (!model_) {
@@ -204,6 +209,12 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
       SetTensorData(values, &tensor->data);
       break;
     }
+    case kTfLiteBool: {
+      const auto& values = testing::Split<bool>(csv_values, ",");
+      if (!CheckSizes<bool>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
     default:
       fprintf(stderr, "Unsupported type %d in SetInput\n", tensor->type);
       Invalidate("Unsupported tensor data type");
@@ -232,6 +243,9 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
     case kTfLiteUInt8:
       expected_output_[id]->SetData<uint8_t>(csv_values);
       break;
+    case kTfLiteBool:
+      expected_output_[id]->SetData<bool>(csv_values);
+      break;
     default:
       fprintf(stderr, "Unsupported type %d in SetExpectation\n", tensor->type);
       Invalidate("Unsupported tensor data type");
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 8a35fb9034ca9cd1b9eb87956aed1eb96485dc9b..5b86e4e5aeee68f165690fbe7a5368cb85f170c3 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -219,6 +219,7 @@ cc_library(
         "graph_transformations/drop_fake_quant.cc",
         "graph_transformations/drop_im2col_arrays.cc",
         "graph_transformations/ensure_bias_vectors.cc",
+        "graph_transformations/experimental_shuffle_fc_weights.cc",
         "graph_transformations/fuse_activation_functions.cc",
         "graph_transformations/fuse_binary_into_following_affine.cc",
         "graph_transformations/fuse_binary_into_preceding_affine.cc",
@@ -238,6 +239,8 @@ cc_library(
         "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
+        "graph_transformations/quantization_util.cc",
+        "graph_transformations/quantization_util.h",
         "graph_transformations/quantize.cc",
         "graph_transformations/read_fake_quant_min_max.cc",
         "graph_transformations/remove_final_dequantize_op.cc",
@@ -249,6 +252,7 @@ cc_library(
         "graph_transformations/remove_trivial_passthrough.cc",
         "graph_transformations/remove_trivial_passthrough.h",
         "graph_transformations/remove_trivial_quantized_activation_func.cc",
+        "graph_transformations/remove_trivial_quantized_min_max.cc",
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_trivial_slice.cc",
         "graph_transformations/remove_unused_op.cc",
@@ -263,6 +267,7 @@ cc_library(
         "graph_transformations/resolve_constant_gather.cc",
         "graph_transformations/resolve_constant_random_uniform.cc",
         "graph_transformations/resolve_constant_range.cc",
+        "graph_transformations/resolve_constant_reshape.cc",
         "graph_transformations/resolve_constant_shape_or_rank.cc",
         "graph_transformations/resolve_constant_stack.cc",
         "graph_transformations/resolve_constant_strided_slice.cc",
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 4a77196aab760b1b4711db123afcb45bfe779719..99ccfaea648077b7b72af30b32dd53b42b85d3a2 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -704,6 +704,15 @@ void ConvertRelu6Operator(const Relu6Operator& src_op,
   (*relu_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertLogOperator(const LogOperator& src_op, GraphDef* tensorflow_graph) {
+  auto* op = tensorflow_graph->add_node();
+  op->set_op("Log");
+  op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *op->add_input() = src_op.inputs[0];
+  (*op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertLogisticOperator(const LogisticOperator& src_op,
                              GraphDef* tensorflow_graph) {
   auto* relu_op = tensorflow_graph->add_node();
@@ -874,6 +883,9 @@ void ConvertFakeQuantOperator(const FakeQuantOperator& src_op,
   CHECK(src_op.minmax);
   (*fakequant_op->mutable_attr())["min"].set_f(src_op.minmax->min);
   (*fakequant_op->mutable_attr())["max"].set_f(src_op.minmax->max);
+  if (src_op.num_bits) {
+    (*fakequant_op->mutable_attr())["num_bits"].set_i(src_op.num_bits);
+  }
 }
 
 void ConvertMaxPoolOperator(const MaxPoolOperator& src_op,
@@ -1703,6 +1715,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kRelu6) {
     ConvertRelu6Operator(static_cast<const Relu6Operator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLog) {
+    ConvertLogOperator(static_cast<const LogOperator&>(src_op),
+                       tensorflow_graph);
   } else if (src_op.type == OperatorType::kLogistic) {
     ConvertLogisticOperator(static_cast<const LogisticOperator&>(src_op),
                             tensorflow_graph);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f098981a5cf4b91df4c7798bd3db8563705a3bd0
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) {
+  Operator* op = model->operators[op_index].get();
+  if (op->type != OperatorType::kFullyConnected) {
+    return false;
+  }
+  FullyConnectedOperator* fc_op = static_cast<FullyConnectedOperator*>(op);
+  // Exit if this FC op already has shuffled weights
+  if (fc_op->experimental_shuffled_weights) {
+    return false;
+  }
+  const Array& input_array = model->GetArray(fc_op->inputs[0]);
+  const string& weights_name = fc_op->inputs[1];
+  Array& weights_array = model->GetArray(weights_name);
+  const Array& output_array = model->GetArray(fc_op->outputs[0]);
+  // Exit if this FC op isn't quantized with uint8 inputs and int16 outputs,
+  // the only case where we are currently interested in providing a fast path
+  // with shuffled weights.
+  if (input_array.data_type != ArrayDataType::kUint8 ||
+      weights_array.data_type != ArrayDataType::kUint8 ||
+      output_array.data_type != ArrayDataType::kInt16 ||
+      !input_array.quantization_params || !weights_array.quantization_params ||
+      !output_array.quantization_params) {
+    return false;
+  }
+  // Exit if the shapes aren't known
+  if (!input_array.has_shape() || !weights_array.has_shape()) {
+    return false;
+  }
+  // Exit if, based on the known shapes, this FC op is not a GEMV.
+  // The shuffling of FC weights is only useful to enable fast GEMV paths.
+  const Shape& input_shape = input_array.shape();
+  for (int i = 0; i < input_shape.dimensions_count() - 1; i++) {
+    if (input_shape.dims(i) != 1) {
+      // The input activations, shaped as a matrix, have multiple columns.
+      // This FC op isn't a matrix*vector multiplication.
+      AddMessageF(
+          "Not applying experimental shuffling to the weights of %s because "
+          "it's not a matrix*vector product",
+          LogName(*op));
+      return false;
+    }
+  }
+  // Exit if the weights shape isn't an integral multiple of the shuffled
+  // block shape, 4x16. We don't want to have to write code dealing with
+  // odd sizes, that would go un-exercised at the moment as the models
+  // for which we need this shuffling have shapes that are multiples of that
+  // 4x16 block size. In fact, much of the rationale for this shuffling is
+  // to avoid cache aliasin issue with large power-of-two depths, with our
+  // models motivating this shuffling having FC weights shapes like
+  // 4096x2048. Thus, if some model doesn't get the shuffling because of that
+  // size requirement, that might be just fine --- that model might just not
+  // suffer from that cache aliasing issue that we have with large powers of
+  // two.
+  const Shape& weights_shape = weights_array.shape();
+  if (weights_shape.dimensions_count() != 2) {
+    return false;
+  }
+  const int rows = weights_shape.dims(0);
+  const int cols = weights_shape.dims(1);
+  if (rows % 4 || cols % 16) {
+    AddMessageF(
+        "Not applying experimental shuffling to the weights of %s because its "
+        "shape isn't a multiple of the shuffling block shape, 4x16",
+        LogName(*op));
+    return false;
+  }
+  // Exit if the weights aren't already a constant array.
+  if (!weights_array.buffer) {
+    return false;
+  }
+  // Exit if the weights are used by more than one op.
+  if (CountOpsWithInput(*model, weights_name) != 1) {
+    AddMessageF(
+        "Not applying experimental shuffling to the weights of %s because that "
+        "array is consumed by other operators",
+        LogName(*op));
+    return false;
+  }
+  // Compute the shuffled weights
+  auto& weights_data =
+      weights_array.GetMutableBuffer<ArrayDataType::kUint8>().data;
+  CHECK_EQ(rows * cols, weights_data.size());
+  std::vector<uint8> shuffled_data(weights_data.size());
+  uint8* shuffled_data_ptr = shuffled_data.data();
+  for (int r = 0; r < rows; r += 4) {
+    for (int c = 0; c < cols; c += 16) {
+      for (int i = 0; i < 4; i++) {
+        const uint8* src_data_ptr = weights_data.data() + (r + i) * cols + c;
+        for (int j = 0; j < 16; j++) {
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the runtime will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_data_ptr++ = dst_val;
+        }
+      }
+    }
+  }
+  CHECK_EQ(shuffled_data_ptr, shuffled_data.data() + rows * cols);
+  // Switch this FC op to using the shuffled weights.
+  weights_data = std::move(shuffled_data);
+  fc_op->experimental_shuffled_weights = true;
+  AddMessageF("Applied experimental shuffling to the weights of %s",
+              LogName(*op));
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 27c5044bb3e06e4a052ff0c4984226fb9d113f95..dbf029a8539d6b57fb46a43ae8a697e90d262fc1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -146,6 +146,7 @@ DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenation)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenationInput)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialSlice)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedActivationFunc)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedMinMax)
 DECLARE_GRAPH_TRANSFORMATION(RemoveUnusedOp)
 DECLARE_GRAPH_TRANSFORMATION(ResolveBatchNormalization)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantBinaryOperator)
@@ -164,6 +165,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSwitch)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowTile)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantConcatenation)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantReshape)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTranspose)
 DECLARE_GRAPH_TRANSFORMATION(DropFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(UnfuseActivationFunctions)
@@ -185,6 +187,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantGather)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMultiplyByZero)
 DECLARE_GRAPH_TRANSFORMATION(Dequantize)
 DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup)
+DECLARE_GRAPH_TRANSFORMATION(ExperimentalShuffleFCWeights)
 
 class ResolveReshapeAttributes : public GraphTransformation {
  public:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
index c363b93394f0af7bcfc37c1e8be5f98aca6667ae..e9842524c829b839b97b3453a36c41efe186efbb 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
@@ -306,6 +306,12 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
+  if (static_cast<FullyConnectedOperator*>(fully_connected)
+          ->experimental_shuffled_weights) {
+    // Not yet implemented: experimental shuffled weights in fused LSTM cell.
+    return false;
+  }
+
   // Emplace a new LSTM cell operator
   auto* lstm_cell_op = new LstmCellOperator;
   lstm_cell_op->inputs.resize(LstmCellOperator::NUM_INPUTS);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 68d6f21cf847bfb112213cabd326854bb174cfc1..9191e696629f42faf89289f53f9dbc97f4527ce9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1060,17 +1060,15 @@ void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
   }
   QCHECK(crops_array.data_type == ArrayDataType::kInt32);
   const auto& crops_data = crops_array.GetBuffer<ArrayDataType::kInt32>().data;
-  // We don't support crops now.
-  QCHECK_EQ(crops_data[0], 0);
-  QCHECK_EQ(crops_data[1], 0);
-  QCHECK_EQ(crops_data[2], 0);
-  QCHECK_EQ(crops_data[3], 0);
-
+  const int crops_top = crops_data[0];
+  const int crops_bottom = crops_data[1];
+  const int crops_left = crops_data[2];
+  const int crops_right = crops_data[3];
+  const int output_height =
+      input_height * block_height - crops_top - crops_bottom;
+  const int output_width = input_width * block_width - crops_left - crops_right;
   QCHECK_EQ(input_shape.dims(0) % (block_height * block_width), 0);
 
-  int output_height = input_height * block_height;
-  int output_width = input_width * block_width;
-
   model->GetArray(op->outputs[0])
       .copy_shape(Shape({input_shape.dims(0) / (block_height * block_width),
                          output_height, output_width, input_shape.dims(3)}));
@@ -1479,6 +1477,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kPRelu:
     case OperatorType::kSoftmax:
     case OperatorType::kLogSoftmax:
+    case OperatorType::kLog:
     case OperatorType::kLogistic:
     case OperatorType::kTanh:
     case OperatorType::kLocalResponseNormalization:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e080df4bed5bd260f8748b9e7f39462133db80d5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
@@ -0,0 +1,173 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
+                                        double* out_min_value,
+                                        double* out_max_value) {
+  switch (data_type) {
+    case ArrayDataType::kUint8:
+      *out_min_value = 0;
+      *out_max_value = 255;
+      return true;
+    case ArrayDataType::kInt16:
+      *out_min_value = -32768;
+      *out_max_value = 32767;
+      return true;
+    default:
+      return false;
+  }
+}
+
+ArrayDataType GetQuantizedDataType(const Array& array,
+                                   ArrayDataType default_type) {
+  switch (array.final_data_type) {
+    case ArrayDataType::kInt8:
+    case ArrayDataType::kUint8:
+    case ArrayDataType::kInt16:
+    case ArrayDataType::kUint16:
+    case ArrayDataType::kInt32:
+    case ArrayDataType::kUint32:
+    case ArrayDataType::kInt64:
+    case ArrayDataType::kUint64:
+      return array.final_data_type;
+    case ArrayDataType::kFloat:
+    case ArrayDataType::kNone:
+      return default_type;
+    default:
+      LOG(FATAL) << "Unhandled final quantization type "
+                 << static_cast<int>(array.final_data_type);
+  }
+}
+
+void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
+                           QuantizationParams* quantization_params) {
+  switch (data_type) {
+    case ArrayDataType::kInt8:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt8>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint8:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt16:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint16:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint16>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt32:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt32>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint32:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint32>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kInt64:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt64>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kUint64:
+      GetQuantizationParamsFromMinMax<ArrayDataType::kUint64>(
+          minmax, quantization_params);
+      break;
+    case ArrayDataType::kFloat:
+    case ArrayDataType::kNone:
+    default:
+      LOG(FATAL) << "Unhandled final quantization type "
+                 << static_cast<int>(data_type);
+  }
+}
+
+bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
+                                 const Array& array, double clamp_min,
+                                 double clamp_max) {
+  ArrayDataType quantized_data_type =
+      GetQuantizedDataType(array, array.data_type);
+  if (quantized_data_type == ArrayDataType::kNone ||
+      quantized_data_type == ArrayDataType::kFloat) {
+    // The array is not (or never will be) quantized.
+    return false;
+  }
+
+  QuantizationParams quantization_params;
+  if (!array.quantization_params) {
+    if (!array.minmax) {
+      transformation->AddMessageF("No quantization params and no minmax");
+      return false;
+    } else {
+      // Work around cases where we are asking for this prior to the Quantize
+      // transformation having added the quantization_params.
+      GetQuantizationParams(quantized_data_type, *array.minmax,
+                            &quantization_params);
+      transformation->AddMessageF(
+          "No quantization params - infering from data type %s with minmax "
+          "%g,%g as zero_point=%g, scale=%g",
+          ArrayDataTypeName(quantized_data_type), array.minmax->min,
+          array.minmax->max, quantization_params.zero_point,
+          quantization_params.scale);
+    }
+  } else {
+    quantization_params = array.GetQuantizationParams();
+  }
+
+  double quantized_min, quantized_max;
+  CHECK(GetQuantizedDataTypeNumericalRange(quantized_data_type, &quantized_min,
+                                           &quantized_max))
+      << "Type is not quantized";
+
+  bool has_nontrivial_min_bound = false;
+  bool has_nontrivial_max_bound = false;
+
+  double lowest_representable_output =
+      (quantized_min - quantization_params.zero_point) *
+      quantization_params.scale;
+  if (lowest_representable_output < clamp_min) {
+    has_nontrivial_min_bound = true;
+    transformation->AddMessageF(
+        "Quantized activation function is not trivial: "
+        "the lowest representable output value %g"
+        " less than the clamp min bound %g.",
+        lowest_representable_output, clamp_min);
+  }
+
+  double highest_representable_output =
+      (quantized_max - quantization_params.zero_point) *
+      quantization_params.scale;
+  if (highest_representable_output > clamp_max) {
+    has_nontrivial_max_bound = true;
+    transformation->AddMessageF(
+        "Quantized activation function is not trivial: "
+        "the highest representable output value %g"
+        " is greater than the clamp max bound %g.",
+        highest_representable_output, clamp_max);
+  }
+
+  return !has_nontrivial_min_bound && !has_nontrivial_max_bound;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..35fb3107775e0e42ddf5f1991e981e5129b5b974
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+// Gets the min/max numerical range for the given quantized data type.
+// For example, kUint8 will return [0,255].
+// Returns true if the ranges were set and false if the type is not quantized.
+bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
+                                        double* out_min_value,
+                                        double* out_max_value);
+
+// Returns the quantized data type of an array, falling back to the provided
+// default data type.
+ArrayDataType GetQuantizedDataType(const Array& array,
+                                   ArrayDataType default_type);
+
+// Gets the quantization params for the array with the given data type and
+// minmax.
+void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
+                           QuantizationParams* quantization_params);
+
+// Returns true if the given array, when quantized, contains only values between
+// the provided clamp min/max.
+// Either clamp_min or clamp_max may be +/-infinity to indicate that the value
+// is unbounded on that side.
+bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
+                                 const Array& array, double clamp_min,
+                                 double clamp_max);
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 5b1268f9a9bf49862e072f305836ae93cdd0344b..d6cae3cdbf68a3d8589d45e92194cc9d14b6d3bf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -44,6 +45,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kTensorFlowMinimum ||
          type == OperatorType::kTensorFlowMaximum ||
          type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
+         type == OperatorType::kLogSoftmax ||
          type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
          type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kTensorFlowReshape ||
@@ -204,70 +206,6 @@ QuantizationPoints GetQuantizationPoints(ArrayDataType data_type) {
   }
 }
 
-ArrayDataType GetQuantizedDataType(const Array& array,
-                                   ArrayDataType default_type) {
-  switch (array.final_data_type) {
-    case ArrayDataType::kInt8:
-    case ArrayDataType::kUint8:
-    case ArrayDataType::kInt16:
-    case ArrayDataType::kUint16:
-    case ArrayDataType::kInt32:
-    case ArrayDataType::kUint32:
-    case ArrayDataType::kInt64:
-    case ArrayDataType::kUint64:
-      return array.final_data_type;
-    case ArrayDataType::kFloat:
-    case ArrayDataType::kNone:
-      return default_type;
-    default:
-      LOG(FATAL) << "Unhandled final quantization type "
-                 << static_cast<int>(array.final_data_type);
-  }
-}
-
-void GetQuantizationParams(ArrayDataType data_type, const MinMax& minmax,
-                           QuantizationParams* quantization_params) {
-  switch (data_type) {
-    case ArrayDataType::kInt8:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt8>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kUint8:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kInt16:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kUint16:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kUint16>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kInt32:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt32>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kUint32:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kUint32>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kInt64:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kInt64>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kUint64:
-      GetQuantizationParamsFromMinMax<ArrayDataType::kUint64>(
-          minmax, quantization_params);
-      break;
-    case ArrayDataType::kFloat:
-    case ArrayDataType::kNone:
-    default:
-      LOG(FATAL) << "Unhandled final quantization type "
-                 << static_cast<int>(data_type);
-  }
-}
-
 bool ChooseQuantizationForOperatorInput(
     GraphTransformation* transformation, Model* model, const Operator& op,
     std::size_t input_index, ArrayDataType* quantized_data_type,
@@ -335,12 +273,11 @@ bool ChooseQuantizationForOperatorInput(
   *quantized_data_type = GetQuantizedDataType(array, ArrayDataType::kUint8);
   GetQuantizationParams(*quantized_data_type, minmax, quantization_params);
   transformation->AddMessageF(
-      "For input array %s with min=%g"
-      ", max=%g"
-      ", chose to quantize as %s with zero_point=%d"
-      ", scale=%g",
+      "For input array %s with min=%g, max=%g, chose to quantize as %s (f=%s) "
+      "with zero_point=%d, scale=%g",
       input, minmax.min, minmax.max, ArrayDataTypeName(*quantized_data_type),
-      quantization_params->zero_point, quantization_params->scale);
+      ArrayDataTypeName(array.final_data_type), quantization_params->zero_point,
+      quantization_params->scale);
   return true;
 }
 
@@ -394,6 +331,19 @@ bool ChooseHardcodedQuantizationForOperatorOutput(
                                  *quantization_params));
     return true;
   }
+  if (op.type == OperatorType::kLogSoftmax) {
+    // LogSoftmax has range: [LogSoftmaxOperator::kOutputRangeMin, 0].
+    *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
+    const QuantizationPoints qp = GetQuantizationPoints(*quantized_data_type);
+    quantization_params->zero_point = qp.max_value;
+    quantization_params->scale =
+        -LogSoftmaxOperator::kOutputRangeMin / (qp.max_value + 1);
+    // While not strictly necessary, it is easier to interpret output data and
+    // quantization if the scale is similar to others (such as power of 2).
+    CHECK(IsExactlyRepresentable(LogSoftmaxOperator::kOutputRangeMin / 2,
+                                 *quantized_data_type, *quantization_params));
+    return true;
+  }
   if (op.type == OperatorType::kTanh) {
     // Tanh has the range: [-1, 1].
     *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
@@ -511,6 +461,7 @@ void FixMinMaxPostQuantization(ArrayDataType quantized_data_type,
     minmax->max = max;
   }
 }
+
 }  // namespace
 
 bool Quantize::Run(Model* model, std::size_t op_index) {
@@ -661,6 +612,8 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
 
       // Fix up the min/max information on the output array to match the chosen
       // quantization parameters.
+      CHECK(output_array.minmax)
+          << "Output array named " << output << " lacks minmax";
       auto& output_minmax = output_array.GetMinMax();
       FixMinMaxPostQuantization(quantized_data_type, quantization_params,
                                 &output_minmax);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index aa93ace03af300f9cbd3f9c6620a6a58b9329aa4..3e021b819fc82d66fb70596a62fd7cee4911d4e8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -82,22 +82,13 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
 
   if (IsDiscardableArray(*model, output_name)) {
     transformation->AddMessageF(
-        "Removing %s, keeping its non-constant input array",
-        LogName(*passthru_op));
-    for (const string& input : passthru_op->inputs) {
-      if (IsDiscardableArray(*model, input) && input != main_input_name &&
-          CountOpsWithInput(*model, input) == 1) {
-      }
-    }
+        "Removing %s, keeping its non-constant input array %s and removing %s",
+        LogName(*passthru_op), main_input_name, output_name);
     RerouteEdges(output_name, main_input_name, model);
   } else if (IsDiscardableArray(*model, main_input_name)) {
-    transformation->AddMessageF("Removing %s, keeping its output array",
-                                LogName(*passthru_op));
-    for (const string& input : passthru_op->inputs) {
-      if (IsDiscardableArray(*model, input) &&
-          (input == main_input_name || CountOpsWithInput(*model, input) == 1)) {
-      }
-    }
+    transformation->AddMessageF(
+        "Removing %s, keeping its output array %s and removing input %s",
+        LogName(*passthru_op), output_name, main_input_name);
     RerouteEdges(main_input_name, output_name, model);
   } else {
     transformation->AddMessageF(
@@ -113,6 +104,16 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
   // Remove any array that is no longer used.
   for (const string& removal_candidate : removal_candidates) {
     bool is_referenced = false;
+    for (const auto& array : model->flags.input_arrays()) {
+      if (array.name() == removal_candidate) {
+        is_referenced = true;
+      }
+    }
+    for (const auto& array_name : model->flags.output_arrays()) {
+      if (array_name == removal_candidate) {
+        is_referenced = true;
+      }
+    }
     for (const auto& op : model->operators) {
       for (const string& input : op->inputs) {
         if (input == removal_candidate) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
index 9b65feaa6443cd32ac1bef961600ff225d52d4b2..752560e075a087bcc2b0a3cb19dad484fb582d42 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/runtime/types.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
@@ -26,27 +28,44 @@ limitations under the License.
 
 namespace toco {
 
-bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
-                                               std::size_t op_index) {
-  const auto it = model->operators.begin() + op_index;
-  auto* op = it->get();
-  if (op->fused_activation_function != FusedActivationFunctionType::kRelu &&
-      op->fused_activation_function != FusedActivationFunctionType::kRelu1 &&
-      op->fused_activation_function != FusedActivationFunctionType::kRelu6) {
-    return false;
-  }
-  const auto& output_array = model->GetArray(op->outputs[0]);
-  if (!output_array.quantization_params) {
-    return false;
-  }
-  if (output_array.data_type != ArrayDataType::kUint8) {
-    return false;
+namespace {
+
+bool IsTrivialUnfusedActivationFunc(GraphTransformation* transformation,
+                                    const Model& model, OperatorType op_type,
+                                    const string& input_array_name) {
+  double clamp_min;
+  double clamp_max;
+  switch (op_type) {
+    case OperatorType::kRelu:
+      clamp_min = 0.0;
+      clamp_max = std::numeric_limits<double>::infinity();
+      break;
+    case OperatorType::kRelu1:
+      clamp_min = -1.0;
+      clamp_max = 1.0;
+      break;
+    case OperatorType::kRelu6:
+      clamp_min = 0.0;
+      clamp_max = 6.0;
+      break;
+    default:
+      return false;
   }
-  const auto& quantization_params = output_array.GetQuantizationParams();
 
+  const auto& input_array = model.GetArray(input_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, input_array, clamp_min,
+                                     clamp_max);
+}
+
+bool IsTrivialFusedActivationFunc(
+    GraphTransformation* transformation, const Model& model,
+    FusedActivationFunctionType activation_function,
+    const string& output_array_name) {
   double clamp_min;
   double clamp_max;
-  switch (op->fused_activation_function) {
+  switch (activation_function) {
+    case FusedActivationFunctionType::kNone:
+      return false;
     case FusedActivationFunctionType::kRelu:
       clamp_min = 0.0;
       clamp_max = std::numeric_limits<double>::infinity();
@@ -61,45 +80,46 @@ bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
       break;
     default:
       LOG(FATAL) << "Unsupported fused activation type: "
-                 << static_cast<int>(op->fused_activation_function);
+                 << static_cast<int>(activation_function);
       return false;
   }
 
-  bool has_nontrivial_min_bound = false;
-  bool has_nontrivial_max_bound = false;
+  const auto& output_array = model.GetArray(output_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, output_array, clamp_min,
+                                     clamp_max);
+}
 
-  double lowest_representable_output =
-      (0. - quantization_params.zero_point) * quantization_params.scale;
-  if (lowest_representable_output < clamp_min) {
-    has_nontrivial_min_bound = true;
-    AddMessageF(
-        "Quantized activation function is not trivial: "
-        "the lowest representable output value %g"
-        " less than the clamp min bound %g.",
-        lowest_representable_output, clamp_min);
-  }
-  double highest_representable_output =
-      (255. - quantization_params.zero_point) * quantization_params.scale;
-  if (highest_representable_output > clamp_max) {
-    has_nontrivial_max_bound = true;
-    AddMessageF(
-        "Quantized activation function is not trivial: "
-        "the highest representable output value %g"
-        " is greater than the clamp max bound %g.",
-        highest_representable_output, clamp_max);
-  }
+}  // namespace
 
-  if (has_nontrivial_min_bound || has_nontrivial_max_bound) {
+// Attempts to remove both fused and unfused activation functions if the
+// quantization params indicate that the representable values fall inside the
+// activation range.
+bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
+                                               std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  if (op->inputs.empty()) {
     return false;
   }
 
-  op->fused_activation_function = FusedActivationFunctionType::kNone;
-  AddMessageF(
-      "Removing trivial quantized activation function on %s"
-      " because the output quantization parameters imply at least as tight"
-      " a clamp anyway.",
-      LogName(*op));
-  return true;
+  if (IsTrivialUnfusedActivationFunc(this, *model, op->type, op->inputs[0])) {
+    AddMessageF(
+        "Removing trivial unfused activation function %s because the input "
+        "minmax imply at least as tight a clamp anyway.",
+        LogName(*op));
+    return RemoveTrivialPassthroughOp(this, model, op_index);
+  }
+  if (IsTrivialFusedActivationFunc(this, *model, op->fused_activation_function,
+                                   op->outputs[0])) {
+    op->fused_activation_function = FusedActivationFunctionType::kNone;
+    AddMessageF(
+        "Removing trivial quantized activation function on %s "
+        "because the output quantization parameters imply at least as tight "
+        "a clamp anyway.",
+        LogName(*op));
+    return true;
+  }
+  return false;
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eaee1c662b7cedb2baec7be47e12e348c3e7b25c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsTrivialMinMax(GraphTransformation* transformation, const Model& model,
+                     OperatorType op_type, const string& input_array_name,
+                     const string& clamp_value_array_name) {
+  const auto& clamp_value_array = model.GetArray(clamp_value_array_name);
+  if (!IsConstantParameterArray(model, clamp_value_array_name)) {
+    transformation->AddMessageF("Clip value array %s is non-constant",
+                                clamp_value_array_name);
+    return false;
+  }
+  const auto& clamp_value_buffer =
+      clamp_value_array.GetBuffer<ArrayDataType::kFloat>();
+  CHECK_EQ(clamp_value_buffer.Length(), 1);
+  float clamp_value = clamp_value_buffer.data[0];
+
+  double clamp_min;
+  double clamp_max;
+  switch (op_type) {
+    case OperatorType::kTensorFlowMinimum:
+      clamp_min = -std::numeric_limits<double>::infinity();
+      clamp_max = clamp_value;
+      break;
+    case OperatorType::kTensorFlowMaximum:
+      clamp_min = clamp_value;
+      clamp_max = std::numeric_limits<double>::infinity();
+      break;
+    default:
+      CHECK(false);
+      return false;
+  }
+
+  const auto& input_array = model.GetArray(input_array_name);
+  return IsArrayQuantizedRangeSubset(transformation, input_array, clamp_min,
+                                     clamp_max);
+}
+
+}  // namespace
+
+// Attempts to remove min/max functions if the quantization params indicate that
+// the representable values fall inside the clip range.
+bool RemoveTrivialQuantizedMinMax::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  if ((op->type != OperatorType::kTensorFlowMinimum &&
+       op->type != OperatorType::kTensorFlowMaximum) ||
+      op->inputs.size() != 2) {
+    return false;
+  }
+  if (IsTrivialMinMax(this, *model, op->type, op->inputs[0], op->inputs[1])) {
+    AddMessageF(
+        "Removing trivial min/max %s because the quantization parameters imply "
+        "at least as tight a clamp anyway.",
+        LogName(*op));
+    return RemoveTrivialPassthroughOp(this, model, op_index);
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
index 61477d59aea2f11c6347b84d8863763a86c43558..e28d8cf01eafee64e08ac2cc4b43ea7c227456c2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
@@ -41,8 +41,8 @@ bool IsReshapeTrivial(const Model& model, const Operator& op,
         ShapesAgreeUpToExtending(input_array.shape(), output_array.shape())) {
       transformation->AddMessageF(
           "%s is trivial because its input and output shapes are equal up to "
-          "extending "
-          "by 1's, and we are told to aggressively discard such Reshape ops.",
+          "extending by 1's, and we are told to aggressively discard such "
+          "Reshape ops.",
           LogName(op));
       return true;
     }
@@ -80,6 +80,7 @@ bool RemoveTrivialReshape::Run(Model* model, std::size_t op_index) {
   }
 
   if (!IsReshapeTrivial(*model, *reshape_op, this)) {
+    AddMessageF("%s is not trivial", LogName(*reshape_op));
     return false;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
index aa2c293382a98b476bee783ed8e177b19d35b858..8e6aaf544aa5310b4233d93e7bc8f484f6164b8a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
@@ -47,7 +47,8 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     bool found_output_as_rnn_state_array = false;
     for (const auto& rnn_state : model->flags.rnn_states()) {
       if (output == rnn_state.state_array()) {
-        CHECK(op->type == OperatorType::kFill);
+        CHECK(op->type == OperatorType::kFill ||
+              op->type == OperatorType::kTensorFlowIdentity);
         found_output_as_rnn_state_array = true;
         break;
       }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
index fb109eb91b16e3a73005230f821c18b9ef82d2fb..2b3ee36ad10e24ab7367ca44c03a234688a63a9b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
@@ -33,7 +33,7 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   const auto* bn_op =
       static_cast<const BatchNormalizationOperator*>(bn_it->get());
 
-  const auto& mean_array = model->GetArray(bn_op->inputs[1]);
+  auto& mean_array = model->GetArray(bn_op->inputs[1]);
   const auto& multiplier_array = model->GetArray(bn_op->inputs[2]);
   const auto& offset_array = model->GetArray(bn_op->inputs[3]);
 
@@ -49,6 +49,13 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   CHECK(multiplier_array.data_type == ArrayDataType::kFloat);
   CHECK(offset_array.data_type == ArrayDataType::kFloat);
 
+  // This graph transformations will need to address constant buffers below,
+  // so we need to exit early if these buffers don't exist (i.e. if the params
+  // haven't yet been resolved as constants).
+  if (!mean_array.buffer || !multiplier_array.buffer || !offset_array.buffer) {
+    return false;
+  }
+
   // Create the new Mul, Add operators
   auto* mul_op = new MulOperator;
   auto* add_op = new AddOperator;
@@ -80,9 +87,15 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   DCHECK_EQ(bn_it->get(), bn_op);
 
   // Create the new param arrays
-  const auto& mean_shape = mean_array.shape();
+  auto& mean_shape = *mean_array.mutable_shape();
   const auto& multiplier_shape = multiplier_array.shape();
   const auto& offset_shape = offset_array.shape();
+  if (mean_shape.dims().empty()) {
+    *mean_shape.mutable_dims() = multiplier_shape.dims();
+    auto& data = mean_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+    CHECK_EQ(data.size(), 1);
+    data.resize(RequiredBufferSizeForShape(mean_shape), data[0]);
+  }
   CHECK(mean_shape.dims() == multiplier_shape.dims());
   CHECK(mean_shape.dims() == offset_shape.dims());
   const auto& param_shape = mean_shape;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 064810b53e7c3bee4601204c9dbd976c374a6a60..d916ae0ddf017fe6a2fb2709db6e9de8c258adfc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -105,7 +106,8 @@ void ConcatenateTensorBuffers(const std::vector<Array*>& input_arrays,
 // already set (e.g. because of previous pass in TOCO), it doesn't change it and
 // returns. Otherwise it uses the input arrays min and max values to compute the
 // concatenated array min and max.
-void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
+void SetMinMaxForConcatenedArray(GraphTransformation* transformation,
+                                 const std::vector<Array*>& input_arrays,
                                  Array* concatenated_array) {
   CHECK(concatenated_array->data_type == ArrayDataType::kFloat);
   // If the minmax is already set, use it
@@ -125,6 +127,9 @@ void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
   MinMax& minmax = concatenated_array->GetOrCreateMinMax();
   minmax.min = concat_min;
   minmax.max = concat_max;
+
+  transformation->AddMessageF("Setting concatenated array min/max to %g,%g",
+                              concat_min, concat_max);
 }
 
 }  // namespace
@@ -161,11 +166,14 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
     input_arrays.push_back(&model->GetArray(input_name));
   }
 
+  AddMessageF("Performing constant concat of %s into %s",
+              absl::StrJoin(concat_op->inputs, ", "), concatenated_array_name);
+
   switch (concatenated_array.data_type) {
     case ArrayDataType::kFloat:
       ConcatenateTensorBuffers<ArrayDataType::kFloat>(
           input_arrays, concatenation_axis, &concatenated_array);
-      SetMinMaxForConcatenedArray(input_arrays, &concatenated_array);
+      SetMinMaxForConcatenedArray(this, input_arrays, &concatenated_array);
       break;
     case ArrayDataType::kUint8:
       ConcatenateTensorBuffers<ArrayDataType::kUint8>(
@@ -189,13 +197,13 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
 
   // Remove all the resolved arrays.
   for (const string& input_name : concat_op->inputs) {
-    // Check to prevent removal of shared tensors
+    // Check to prevent removal of shared tensors.
     if (CountOpsWithInput(*model, input_name) == 1) {
       model->EraseArray(input_name);
     }
   }
 
-  // Remove concatenate operator
+  // Remove concatenate operator.
   model->operators.erase(concat_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
index d999c2df9483e096f333c6af83e1d9fee873d4d6..debe298a5a93034bcb928d7384b5ec1fc7439e47 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -98,6 +98,16 @@ bool ResolveConstantGather::Run(Model* model, std::size_t op_index) {
   CHECK(coords_array.data_type == ArrayDataType::kInt32)
       << "Only int32 indices are supported";
 
+  // Copy min/max info if present. The ranges of the selected values may be
+  // a subset of the original range but we want to ensure the quantization
+  // params stay the same.
+  if (input_array.minmax) {
+    const auto& input_minmax = input_array.GetMinMax();
+    auto& output_minmax = output_array.GetOrCreateMinMax();
+    output_minmax.min = input_minmax.min;
+    output_minmax.max = input_minmax.max;
+  }
+
   CHECK(!output_array.buffer);
   switch (output_array.data_type) {
     case ArrayDataType::kFloat:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e7ad383e7789891f5396845241e70143dc8b76f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Resolves a constant reshape operation by copying the buffer.
+bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+  const auto* op = static_cast<const TensorFlowReshapeOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  // We require constant inputs.
+  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return false;
+  }
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  const Array& input_array = model->GetArray(op->inputs[0]);
+  if (!ShapesAgreeUpToExtending(input_array.shape(), output_array.shape())) {
+    AddMessageF("Constant reshape is non-trivial (%s -> %s)",
+                ShapeToString(input_array.shape()),
+                ShapeToString(output_array.shape()));
+    return false;
+  }
+
+  CHECK(!output_array.buffer);
+  switch (input_array.data_type) {
+    case ArrayDataType::kBool:
+      CopyArrayBuffer<ArrayDataType::kBool>(input_array, &output_array);
+      break;
+    case ArrayDataType::kFloat:
+      CopyArrayBuffer<ArrayDataType::kFloat>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt8:
+      CopyArrayBuffer<ArrayDataType::kInt8>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      CopyArrayBuffer<ArrayDataType::kUint8>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt16:
+      CopyArrayBuffer<ArrayDataType::kInt16>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint16:
+      CopyArrayBuffer<ArrayDataType::kUint16>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      CopyArrayBuffer<ArrayDataType::kInt32>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint32:
+      CopyArrayBuffer<ArrayDataType::kUint32>(input_array, &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      CopyArrayBuffer<ArrayDataType::kInt64>(input_array, &output_array);
+      break;
+    case ArrayDataType::kUint64:
+      CopyArrayBuffer<ArrayDataType::kUint64>(input_array, &output_array);
+      break;
+    case ArrayDataType::kString:
+      CopyArrayBuffer<ArrayDataType::kString>(input_array, &output_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type: "
+                 << ArrayDataTypeName(input_array.data_type);
+      return false;
+  }
+
+  AddMessageF("Resolving constant reshape of %s", LogName(*op));
+
+  if (input_array.minmax) {
+    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
+  }
+  if (input_array.quantization_params) {
+    output_array.GetOrCreateQuantizationParams() =
+        input_array.GetQuantizationParams();
+  }
+
+  // Erase input arrays if no longer used.
+  for (const auto& input : op->inputs) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
+      model->EraseArray(input);
+    }
+  }
+
+  // Erase the operator.
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
index 4f984bfde55b3457694bb411bbfdf30723c7066e..1fd20314b14d98bd82e2b20a4e70f5d9c2c3b298 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -131,6 +131,10 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
   if (input_array.minmax) {
     output_array.GetOrCreateMinMax() = input_array.GetMinMax();
   }
+  if (input_array.quantization_params) {
+    output_array.GetOrCreateQuantizationParams() =
+        input_array.GetQuantizationParams();
+  }
 
   if (op->perm.empty()) {
     // Yield until perm has been populated by ResolveTransposeAttributes.
@@ -164,6 +168,8 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
       break;
   }
 
+  AddMessageF("Resolving constant transpose of %s", LogName(*op));
+
   // Erase input arrays if no longer used.
   for (const auto& input : op->inputs) {
     if (IsDiscardableArray(*model, input) &&
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index d4db6f1c009cd19515655fb31974a2e97cfa42e8..f6c8f79d8d3311dc2294e3ec406a184b2a16a6b5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -51,6 +51,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   // Test for unary ops of types that we know how to resolve.
   switch (unary_op->type) {
     case OperatorType::kCast:
+    case OperatorType::kLog:
     case OperatorType::kNeg:
     case OperatorType::kTensorFlowRsqrt:
     case OperatorType::kTensorFlowSqrt:
@@ -218,6 +219,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
     output_float_data[0] = max;
   } else if (unary_op->type == OperatorType::kNeg ||
+             unary_op->type == OperatorType::kLog ||
              unary_op->type == OperatorType::kTensorFlowRsqrt ||
              unary_op->type == OperatorType::kTensorFlowSqrt ||
              unary_op->type == OperatorType::kTensorFlowSquare) {
@@ -231,6 +233,8 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
       float outval = 0.f;
       if (unary_op->type == OperatorType::kNeg) {
         outval = -val;
+      } else if (unary_op->type == OperatorType::kLog) {
+        outval = std::log(val);
       } else if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
         outval = 1.0f / std::sqrt(val);
       } else if (unary_op->type == OperatorType::kTensorFlowSqrt) {
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 876479079b5168b09a4748a3db2077345d363678..155d890c9f23ba206f1f0e6db645a601308cea5b 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -611,6 +611,18 @@ void ConvertRelu6Operator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertLogOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Log");
+  CheckInputsCount(node, tf_import_flags, 1);
+
+  auto op = absl::make_unique<LogOperator>();
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(std::move(op));
+}
+
 void ConvertLogisticOperator(const NodeDef& node,
                              const TensorFlowImportFlags& tf_import_flags,
                              Model* model) {
@@ -682,6 +694,8 @@ void ConvertFakeQuantWithMinMaxArgs(
   minmax.min = GetFloatAttr(node, "min");
   minmax.max = GetFloatAttr(node, "max");
   op->outputs.push_back(node.name());
+  // tf.fake_quant_with_min_max_args num_bits defaults to 8.
+  op->num_bits = HasAttr(node, "num_bits") ? GetIntAttr(node, "num_bits") : 8;
   model->operators.emplace_back(op);
 }
 
@@ -699,6 +713,7 @@ void ConvertFakeQuantWithMinMaxVars(
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+  op->num_bits = HasAttr(node, "num_bits") ? GetIntAttr(node, "num_bits") : 8;
   model->operators.emplace_back(op);
 }
 
@@ -2091,6 +2106,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertLRNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Softmax") {
       ConvertSoftmaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Log") {
+      ConvertLogOperator(node, tf_import_flags, model);
     } else if (node.op() == "LogSoftmax") {
       ConvertLogSoftmaxOperator(node, tf_import_flags, model);
     } else if (node.op() == "All") {
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 9bd72e7de19b75c14b4a383942ae744e3ca0900d..787c20e574b2686329904a30958c1f8c4e5f0e96 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -56,6 +56,7 @@ enum class OperatorType {
   kL2Pool,
   kLstmCell,
   kLocalResponseNormalization,
+  kLog,
   kLogistic,
   kMaxPool,
   kFakeQuant,
@@ -150,9 +151,9 @@ enum class AxesOrder {
 };
 
 // The type of the scalars in an array.
-// Note that that does not by itself tell whether the values in the array are
-// real (are literally interpreted as real numbers) or quantized (only acquire
-// a meaning as real numbers in conjunction with QuantizationParams).
+// Note that the type does not by itself tell whether the values in the array
+// are real (are literally interpreted as real numbers) or quantized (only
+// acquire a meaning as real numbers in conjunction with QuantizationParams).
 //
 // In practice though:
 //   float values are always real
@@ -424,6 +425,7 @@ struct SpaceToDepthOperator : Operator {
 // input activations as a matrix, followed by a MatMul node.
 struct FullyConnectedOperator : Operator {
   FullyConnectedOperator() : Operator(OperatorType::kFullyConnected) {}
+  bool experimental_shuffled_weights = false;
 };
 
 // Dequantization operator, converting a quantized array of integers with
@@ -591,6 +593,17 @@ struct LogisticOperator : Operator {
   LogisticOperator() : Operator(OperatorType::kLogistic) {}
 };
 
+// Element-wise natural log operator:
+//   x -> ln(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Log
+struct LogOperator : Operator {
+  LogOperator() : Operator(OperatorType::kLog) {}
+};
+
 // Element-wise Tanh operator:
 //   x -> Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 //
@@ -712,8 +725,7 @@ struct L2PoolOperator : Operator {
 // The expected [min, max] range of values in a given array.
 // Used for quantization only.
 // This information typically comes from special nodes found in quantized
-// models,
-// see FakeQuantOperator, and is used during quantization to resolve
+// models, see FakeQuantOperator, and is used during quantization to resolve
 // actual quantization parameters (see QuantizationParams).
 struct MinMax {
   double min = 0.;
@@ -741,6 +753,7 @@ inline bool operator==(const MinMax& m1, const MinMax& m2) {
 struct FakeQuantOperator : Operator {
   FakeQuantOperator() : Operator(OperatorType::kFakeQuant) {}
   std::unique_ptr<MinMax> minmax;
+  int num_bits = 8;
 };
 
 // Element-wise division operator.
@@ -1317,6 +1330,15 @@ struct SoftmaxOperator : Operator {
 // TensorFlow equivalent: LogSoftmax
 struct LogSoftmaxOperator : Operator {
   LogSoftmaxOperator() : Operator(OperatorType::kLogSoftmax) {}
+
+  // LogSoftmax can in principal have very large negative output, depending on
+  // the input size.  However, input x_i that is less than x_max-10 is
+  // accumulated as exp(x_i-x_max), which is truncated to zero.
+  //
+  // Since we effectively disregard smallish inputs in the normalizing factor,
+  // we also drop them in the output (set to minimum output), and in doing so
+  // make better use of the quantization range / resolution.
+  static constexpr float kOutputRangeMin = -16.0;
 };
 
 // Cast operator.
@@ -1399,8 +1421,7 @@ struct SpaceToBatchNDOperator : Operator {
 };
 
 // BatchToSpaceND operator. Rearranges data from batch into blocks of
-// spatial data. Currently, only 2-d blocks are supported. Cropping is not
-// supported, either, and the crops array should be all zero.
+// spatial data. Currently, only 2-d blocks are supported.
 //
 // Inputs:
 //   inputs[0]: required: the input array
@@ -1510,7 +1531,7 @@ class Shape {
   int dims(int i) const {
     // Always check for out-of-bounds accesses, even in optimized builds where
     // standard assertions are disabled. Out-of-bounds access here is a common
-    // occurence.
+    // occurrence.
     CHECK_GE(i, 0);
     CHECK_GT(dims_.size(), i);
     return dims_[i];
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
index c35b6f99259b762aa83d92d21512169a7ab50b70..3761e0095ebb06b9e26eca55a36718b92058e47b 100644
--- a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
+++ b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
@@ -50,6 +50,7 @@ class TocoFromProtosTest(googletest.TestCase):
     toco_flags.output_format = toco_flags_pb2.TFLITE
     toco_flags.inference_input_type = types_pb2.FLOAT
     toco_flags.inference_type = types_pb2.FLOAT
+    toco_flags.allow_custom_ops = True;
     model_flags = model_flags_pb2.ModelFlags()
     input_array = model_flags.input_arrays.add()
     input_array.name = TensorName(in_tensor)
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.cc b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
index 8a5e483f3f1676ebed3244bd6f7eb610fad21557..153c117d17e4564d7cb0aaea64d792f63a587d91 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
@@ -75,7 +75,8 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
       toco::Import(toco_flags, model_flags, input_contents_txt);
   toco::Transform(toco_flags, model.get());
   string output_file_contents_txt;
-  Export(toco_flags, *model, &output_file_contents_txt);
+  Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+         &output_file_contents_txt);
 
   // Convert arguments back to byte (py3) or str (py2)
   return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index f991529569d9ab56103bf7e5f91b2d2b7f2d23fe..d2e14ac5e0d7b06451de295574f42c6139cb97a0 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -68,7 +68,9 @@ class Convolution
     auto activation_function =
         ActivationFunction::Serialize(op.fused_activation_function);
     return ::tflite::CreateConv2DOptions(*builder, padding, op.stride_width,
-                                         op.stride_height, activation_function);
+                                         op.stride_height, activation_function,
+                                         op.dilation_width_factor,
+                                         op.dilation_height_factor);
   }
 
   void ReadOptions(const TfLiteOptions& options,
@@ -76,6 +78,8 @@ class Convolution
     op->padding.type = Padding::Deserialize(options.padding());
     op->stride_width = options.stride_w();
     op->stride_height = options.stride_h();
+    op->dilation_width_factor = options.dilation_w_factor();
+    op->dilation_height_factor = options.dilation_h_factor();
     op->fused_activation_function =
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
@@ -260,12 +264,15 @@ class FakeQuant : public CustomOperator<FakeQuantOperator> {
                     flexbuffers::Builder* fbb) const override {
     fbb->Float("min", op.minmax->min);
     fbb->Float("max", op.minmax->max);
+    fbb->Int("num_bits", op.num_bits);
   }
   void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
     auto* minmax = new MinMax;
     minmax->min = m["min"].AsFloat();
     minmax->max = m["max"].AsFloat();
     op->minmax.reset(minmax);
+    const auto& num_bits = m["num_bits"];
+    op->num_bits = num_bits.IsInt() ? num_bits.AsInt32() : 8;
   }
 };
 
@@ -662,6 +669,23 @@ class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
                    TocoOperator* op) const override {}
 };
 
+class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
+                                      ::tflite::BuiltinOptions_ArgMaxOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateArgMaxOptions(
+        *builder, DataType::Serialize(op.output_data_type));
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->output_data_type = DataType::Deserialize(options.output_type());
+  }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -834,6 +858,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
   ops.emplace_back(
       new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
+  ops.emplace_back(
+      new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax));
 
   // Custom Operators.
   ops.emplace_back(
@@ -871,6 +897,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       "LOG_SOFTMAX", OperatorType::kLogSoftmax));
   ops.emplace_back(new SimpleOperator<TensorFlowMaximumOperator>(
       "MAXIMUM", OperatorType::kTensorFlowMaximum));
+  ops.emplace_back(new SimpleOperator<TensorFlowMinimumOperator>(
+      "MINIMUM", OperatorType::kTensorFlowMinimum));
+  ops.emplace_back(new SimpleOperator<TensorFlowLessOperator>(
+      "LESS", OperatorType::kTensorFlowLess));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 4783843b7fa1273201e0c31816b3e1be8e98f5d5..36ed741541eadbc9435a67bec15d389ba48350c1 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -111,6 +111,10 @@ TEST_F(OperatorTest, SimpleOperators) {
                                           OperatorType::kLogSoftmax);
   CheckSimpleOperator<TensorFlowMaximumOperator>(
       "MAXIMUM", OperatorType::kTensorFlowMaximum);
+  CheckSimpleOperator<TensorFlowMinimumOperator>(
+      "MINIMUM", OperatorType::kTensorFlowMinimum);
+  CheckSimpleOperator<TensorFlowLessOperator>("LESS",
+                                              OperatorType::kTensorFlowLess);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -163,10 +167,12 @@ TEST_F(OperatorTest, CustomFakeQuant) {
   minmax->min = -10;
   minmax->max = 200;
   op.minmax.reset(minmax);
+  op.num_bits = 16;
   auto output_toco_op = SerializeAndDeserialize(
       GetOperator("FAKE_QUANT", OperatorType::kFakeQuant), op);
   EXPECT_EQ(op.minmax->min, output_toco_op->minmax->min);
   EXPECT_EQ(op.minmax->max, output_toco_op->minmax->max);
+  EXPECT_EQ(op.num_bits, output_toco_op->num_bits);
 }
 
 TEST_F(OperatorTest, CustomFullyConnected) {
@@ -391,6 +397,13 @@ TEST_F(OperatorTest, BuiltinTopKV2) {
   ASSERT_NE(nullptr, output_toco_op.get());
 }
 
+TEST_F(OperatorTest, BuiltinArgMax) {
+  ArgMaxOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("ARG_MAX", OperatorType::kArgMax), op);
+  EXPECT_EQ(op.output_data_type, output_toco_op->output_data_type);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 96c5ebd64f3343d454a9c445a6edcd318b08c953..5ba093a830db2710384430adf37b4cae1aa403d9 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -83,6 +83,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveConstantGather);
   transformations->Add(new ResolveConstantRandomUniform);
   transformations->Add(new ResolveConstantRange);
+  transformations->Add(new ResolveConstantReshape);
   transformations->Add(new ResolveConstantStack);
   transformations->Add(new ResolveConstantStridedSlice);
   transformations->Add(new ResolveConstantTranspose);
@@ -279,10 +280,13 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
           {new HardcodeMinMax});
     }
     CheckIsReadyForQuantization(*model);
-    RunGraphTransformations(
-        model, "quantization graph transformations",
-        {new Quantize, new RemoveTrivialQuantizedActivationFunc,
-         new RemoveFinalDequantizeOp});
+    RunGraphTransformations(model, "quantization graph transformations",
+                            {
+                                new RemoveTrivialQuantizedActivationFunc,
+                                new RemoveTrivialQuantizedMinMax,
+                                new Quantize,
+                                new RemoveFinalDequantizeOp,
+                            });
   } else {
     GraphTransformationsSet dequantization_transformations{new Dequantize};
     // Dequantize creates FakeQuant nodes. We may want to discard
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index b72f5fa2a77279f08013e9b545c188402383575f..224df9973e45a06aa22f84c0e4676eda5acbccb2 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -291,6 +291,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Dequantize)
     HANDLE_OPERATORTYPENAME_CASE(L2Normalization)
     HANDLE_OPERATORTYPENAME_CASE(LocalResponseNormalization)
+    HANDLE_OPERATORTYPENAME_CASE(Log)
     HANDLE_OPERATORTYPENAME_CASE(Logistic)
     HANDLE_OPERATORTYPENAME_CASE(LstmCell)
     HANDLE_OPERATORTYPENAME_CASE(MaxPool)
@@ -1083,23 +1084,30 @@ void InsertCopyOperator(Model* model, const string& source_array_name,
   model->operators.emplace_back(copy_op);
 }
 
-namespace {
-template <ArrayDataType A>
-void CopyArrayBuffer(const Array& source_array, Array* target_array) {
-  if (source_array.buffer) {
-    const auto& source_buffer = source_array.GetBuffer<A>();
-    auto& target_buffer = target_array->GetMutableBuffer<A>();
-    target_buffer.data = source_buffer.data;
-  }
-}
-}  // namespace
-
 void CloneArray(Model* model, const string& source_array_name,
                 const string& target_array_name) {
   CHECK(!model->HasArray(target_array_name));
   const Array& source_array = model->GetArray(source_array_name);
   Array& target_array = model->GetOrCreateArray(target_array_name);
 
+  if (source_array.minmax) {
+    const auto& smm = source_array.GetMinMax();
+    auto& tmm = target_array.GetOrCreateMinMax();
+    tmm.min = smm.min;
+    tmm.max = smm.max;
+  }
+
+  if (source_array.quantization_params) {
+    const auto& sqp = source_array.GetQuantizationParams();
+    auto& tqp = target_array.GetOrCreateQuantizationParams();
+    tqp.zero_point = sqp.zero_point;
+    tqp.scale = sqp.scale;
+  }
+
+  target_array.data_type = source_array.data_type;
+  target_array.final_data_type = source_array.final_data_type;
+  target_array.copy_shape(source_array.shape());
+
   switch (source_array.data_type) {
     case ArrayDataType::kBool:
       CopyArrayBuffer<ArrayDataType::kBool>(source_array, &target_array);
@@ -1139,25 +1147,6 @@ void CloneArray(Model* model, const string& source_array_name,
                  << ArrayDataTypeName(source_array.data_type);
       return;
   }
-
-  if (source_array.minmax) {
-    const auto& smm = source_array.GetMinMax();
-    auto& tmm = target_array.GetOrCreateMinMax();
-    tmm.min = smm.min;
-    tmm.max = smm.max;
-  }
-
-  if (source_array.quantization_params) {
-    const auto& sqp = source_array.GetQuantizationParams();
-    auto& tqp = target_array.GetOrCreateQuantizationParams();
-    tqp.zero_point = sqp.zero_point;
-    tqp.scale = sqp.scale;
-  }
-
-  target_array.data_type = source_array.data_type;
-  target_array.final_data_type = source_array.final_data_type;
-
-  target_array.copy_shape(source_array.shape());
 }
 
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index dfd81173c3d3bf31a0ce688ce5434cd37fb959c7..ed0ecd4d0fc7a2952cb44e3786e6f0bc1ef3bd7f 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -147,6 +147,23 @@ void FixNoOrphanedArray(Model* model);
 // Fixes input/output arrays that may have issues during export or inference.
 void FixEdgeArrays(Model* model);
 
+// Copies the contents of an array into another.
+// Expects that the shape and data type match.
+template <ArrayDataType A>
+void CopyArrayBuffer(const Array& source_array, Array* target_array) {
+  int source_buffer_size = RequiredBufferSizeForShape(source_array.shape());
+  int target_buffer_size = RequiredBufferSizeForShape(target_array->shape());
+  CHECK_EQ(source_buffer_size, target_buffer_size)
+      << "Buffer sizes must match in element count";
+  CHECK(source_array.data_type == target_array->data_type)
+      << "Data types must match";
+  if (source_array.buffer) {
+    const auto& source_buffer = source_array.GetBuffer<A>();
+    auto& target_buffer = target_array->GetMutableBuffer<A>();
+    target_buffer.data = source_buffer.data;
+  }
+}
+
 // Inserts a no-op reshape operator between the source array and the target
 // array. This effectively just copies the data.
 void InsertCopyOperator(Model* model, const string& source_array_name,
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 44fde69a1e1536b8d2ecff16876248cfe66a9b8a..7b3569ea9c8b15959b15e8ba46cf44d159d5528c 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -78,6 +78,9 @@ cc_test(
         "//tensorflow/contrib/lite:testdata/test_model.bin",
         "//tensorflow/contrib/lite:testdata/test_model_broken.bin",
     ],
+    tags = [
+        "tflite_not_portable_android",
+    ],
     deps = [
         ":gen_op_registration",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index 2b9eee4ef7b418e2b90d388d2f165537b8660a9a..de76acb51ffe985162a66c617b266f47c5216b19 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -711,7 +711,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
       candidate_scores, margin_multiplier * nmi_scores)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   return candidate_ids[argmax_index]
 
@@ -811,7 +811,7 @@ def update_medoid_per_cluster(pairwise_distances, pairwise_distances_subset,
   candidate_scores = math_ops.add(scores_fac, margin_multiplier * scores_margin)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   best_medoid = math_ops.to_int32(cluster_member_ids[argmax_index])
   chosen_ids = update_1d_tensor(chosen_ids, cluster_idx, best_medoid)
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index b6acf71b9d446de6f57a7a7f077cc07276db2b17..d4c3f2eda8be0c70e961afe582983b9f73769c77 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -151,6 +151,7 @@ tensorflow/core/kernels/decode_bmp_op.cc
 tensorflow/core/kernels/depthtospace_op.cc
 tensorflow/core/kernels/data_format_ops.cc
 tensorflow/core/kernels/spacetodepth_op.cc
+tensorflow/core/kernels/dense_update_functor.cc
 tensorflow/core/kernels/dense_update_ops.cc
 tensorflow/core/kernels/deep_conv2d.cc
 tensorflow/core/kernels/decode_wav_op.cc
@@ -301,3 +302,5 @@ tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/batch_util.cc
 tensorflow/core/ops/audio_ops.cc
+tensorflow/core/kernels/decode_proto_op.cc
+tensorflow/core/kernels/encode_proto_op.cc
diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index ff88b4fa841673fc52b9f6fdc5ca43d30c44bbfd..4fe4e8d044bd0b0987c0221ab225f449a71ccfc7 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -348,7 +348,7 @@ def _freeze_graph_with_def_protos(input_graph_def, output_node_names,
                                   input_saver_def, input_checkpoint):
   """Converts all variables in a graph and checkpoint into constants.
 
-  During this process, we need to retain certain initialzer nodes (e.g. table
+  During this process, we need to retain certain initializer nodes (e.g. table
   initializer nodes). Instead of determining which dependencies
   of the shared initializer node (e.g. group_deps) to keep, we
   reconstruct the connections between the individual initializer nodes and
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 088319a5572f346ebb3409f2176e8b3589791f5d..2bf281b7916e296660089234c8487f188a597e5d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2711,7 +2711,9 @@ def streaming_sparse_average_precision_at_top_k(top_k_predictions,
       name=name)
 
 
-@deprecated(None, 'Please switch to tf.metrics.mean.')
+@deprecated(None,
+            'Please switch to tf.metrics.mean_absolute_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_absolute_error(predictions,
                                   labels,
                                   weights=None,
@@ -2830,7 +2832,9 @@ def streaming_mean_relative_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None,
+            'Please switch to tf.metrics.mean_squared_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_squared_error(predictions,
                                  labels,
                                  weights=None,
@@ -2888,7 +2892,10 @@ def streaming_mean_squared_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(
+    None,
+    'Please switch to tf.metrics.root_mean_squared_error. Note that the '
+    'order of the labels and predictions arguments has been switched.')
 def streaming_root_mean_squared_error(predictions,
                                       labels,
                                       weights=None,
diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
index f50575b2cf311e33f7b7c77488bc94b8d24c70ec..54bd39afacbec07f054f61b72eda0a3654858aa7 100644
--- a/tensorflow/contrib/model_pruning/BUILD
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -71,6 +71,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "pruning_utils",
+    srcs = ["python/pruning_utils.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python:platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "pruning",
     srcs = ["python/pruning.py"],
@@ -78,9 +89,20 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         ":core_layers",
+        ":pruning_utils",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/python:platform",
-        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "pruning_utils_test",
+    size = "small",
+    srcs = ["python/pruning_utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pruning_utils",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index 52b659c69fdfc507e6259e928d79c65471f2f025..86f4fd6adf60d8fa54c13989bf4087e28f1e006f 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -45,7 +45,7 @@ The pruning library allows for specification of the following hyper parameters:
 | do_not_prune | list of strings | [""] | list of layers names that are not pruned |
 | threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds |
 | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
-| nbins | integer | 255 | Number of bins to use for histogram computation |
+| nbins | integer | 256 | Number of bins to use for histogram computation |
 | block_height|integer | 1 | Number of rows in a block for block sparse matrices|
 | block_width |integer | 1 | Number of cols in a block for block sparse matrices|
 | block_pooling_function| string | AVG | The function to use to pool weight values in a block: average (AVG) or max (MAX)|
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index 5146a4a2de7806041991c04958de378b2d3dc810..ea6032e588cf398deaf497fb99087436ce1cb2e8 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -33,12 +33,14 @@
   # Returns a list of all the weight tensors that have been masked
   get_weights()
 
-  The Pruning class uses a proto (defined in pruning.proto) to set up the
-  parameters for a pruning specification. Here's a typical usage:
+  The Pruning class uses a tf.hparams object to set up the
+  parameters for a model pruning. Here's a typical usage:
 
-  # Initialize a pruning spec from a proto
-  pruning_spec = '/tmp/pruning.pb'
-  p = Pruning(pruning_spec)
+  # Parse pruning hyperparameters
+  pruning_hparams = pruning.get_pruning_hparams().parse(FLAGS.pruning_hparams)
+
+  # Create a pruning object using the pruning_hparams
+  p = pruning.Pruning(pruning_hparams)
 
   # Add mask update ops to the graph
   mask_update_op = p.conditional_mask_update_op()
@@ -51,24 +53,20 @@
 
   # An object of the pruning also accepts externally defined sparsity:
   sparsity = tf.Variable(0.5, name = "ConstantSparsity")
-  pruning_spec = '/tmp/pruning.pb'
-  p = Pruning(pruning_spec, sparsity=sparsity)
-
+  p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
 """
 # pylint: disable=missing-docstring
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
+from tensorflow.contrib.model_pruning.python import pruning_utils
 from tensorflow.contrib.model_pruning.python.layers import core_layers as core
 from tensorflow.contrib.training.python.training import hparam
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
@@ -87,172 +85,18 @@ _WEIGHT_COLLECTION = core.WEIGHT_COLLECTION
 _MASKED_WEIGHT_NAME = core.MASKED_WEIGHT_NAME
 
 
-def _weight_mask_variable(var, scope):
-  """Create a mask for the weights.
-
-  This function adds a variable 'mask' to the graph.
-
-  Args:
-    var: the weight variable that needs to be masked
-    scope: The variable scope of the variable var
-
-  Returns:
-    the mask variable of the same size and shape as var, initialized to all 1s.
-  """
-  with variable_scope.variable_scope(scope):
-    mask = variable_scope.get_variable(
-        'mask',
-        var.get_shape(),
-        initializer=init_ops.ones_initializer(),
-        trainable=False,
-        dtype=var.dtype)
-  return mask
-
-
-def _weight_threshold_variable(var, scope):
-  """Create a scalar threshold for the weights.
-
-  This function adds a variable
-  'threshold' to the graph.
-
-  Args:
-    var: The weight variable that needs to be masked
-    scope: The variable scope of the variable var
-
-  Returns:
-    a scalar threshold variable initialized to 0.
-  """
-  with variable_scope.variable_scope(scope):
-    threshold = variable_scope.get_variable(
-        'threshold', [],
-        initializer=init_ops.zeros_initializer(),
-        trainable=False,
-        dtype=var.dtype)
-    return threshold
-
-
-def _kronecker_product(mat1, mat2):
-  """Computes the Kronecker product of two matrices mat1 and mat2.
-
-  Args:
-    mat1: A matrix of size m x n
-    mat2: A matrix of size p x q
-  Returns:
-    Kronecker product of matrices mat1 and mat2 of size mp x nq
-  """
-
-  m1, n1 = mat1.get_shape().as_list()
-  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
-  m2, n2 = mat2.get_shape().as_list()
-  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
-  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
-
-
-def _histogram(values, value_range, nbins=100, dtype=np.int32, name=None):
-  """Return histogram of values.
-
-  Given the tensor `values`, this operation returns a rank 1 histogram counting
-  the number of entries in `values` that fell into every bin.  The bins are
-  equal width and determined by the arguments `value_range` and `nbins`.
-
-  Args:
-    values:  Numeric `Tensor`.
-    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
-      values <= value_range[0] will be mapped to hist[0],
-      values >= value_range[1] will be mapped to hist[-1].
-    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
-    dtype:  dtype for returned histogram.
-    name:  A name for this operation (defaults to 'histogram').
-
-  Returns:
-    A 1-D `Tensor` holding histogram of values.
-
-  """
-  with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
-    values = ops.convert_to_tensor(values, name='values')
-    values = gen_array_ops.reshape(values, [-1])
-    value_range = ops.convert_to_tensor(value_range, name='value_range')
-    nbins = ops.convert_to_tensor(nbins, dtype=np.int32, name='nbins')
-    nbins_float = math_ops.cast(nbins, values.dtype)
-
-    # Map tensor values that fall within value_range to [0, 1].
-    scaled_values = math_ops.truediv(
-        values - value_range[0],
-        value_range[1] - value_range[0],
-        name='scaled_values')
-
-    # map tensor values within the open interval value_range to {0,.., nbins-1},
-    # values outside the open interval will be zero or less, or nbins or more.
-    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
-
-    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
-    indices = math_ops.cast(
-        clip_ops.clip_by_value(indices, 0, nbins_float - 1), np.int32)
-
-    return math_ops.unsorted_segment_sum(
-        array_ops.ones_like(indices, dtype=dtype), indices, nbins, name=scope)
-
-
-def _determine_partitioned_axis(partitioned_variable):
-  partitioned_axis = 0
-  concatenated_variable_shape = partitioned_variable.get_shape()
-  for partition in partitioned_variable:
-    partition_shape = partition.get_shape()
-    maybe_partitioned_axis = np.less(partition_shape,
-                                     concatenated_variable_shape)
-    # Sanity check: make sure number of partitioned axis == 1
-    if np.count_nonzero(maybe_partitioned_axis) != 1:
-      raise ValueError('Number of partitioned axes %s not equal to 1' %
-                       np.count_nonzero(maybe_partitioned_axis))
-    partitioned_axis = np.where(maybe_partitioned_axis)[0][0]
-  return partitioned_axis
-
-
-def _variable_assign(var, new_value):
-  return state_ops.assign(var, new_value, name=var.op.name + '_assign')
-
-
-def _partitioned_variable_assign(partitioned_var, new_value):
-  """Assign op for partitioned variables.
-
-  Args:
-    partitioned_var: A partitioned tensorflow variable
-    new_value: Value to be assigned to the variable var
-
-  Returns:
-    A tensorflow op that groups the assign ops for each of the variable slices
-  """
-  # Determine which axis was used to partition the variable. Currently
-  # tensorflow allows partitioning variable only along 1 axis.
-  axis = 0 if len(partitioned_var) == 1 else _determine_partitioned_axis(
-      partitioned_var)
-
-  partition_sizes = np.array(
-      [partition.get_shape()[axis] for partition in partitioned_var])
-  new_partitioned_values = array_ops.split(
-      new_value,
-      ops.convert_to_tensor(partition_sizes, dtype=np.int32),
-      axis=axis)
-  op_list = []
-  for partition in partitioned_var:
-    op_list.append(
-        _variable_assign(partition, new_partitioned_values[len(op_list)]))
-  return control_flow_ops.group(
-      *op_list, name=partitioned_var.name + '_group_assign')
-
-
 def apply_mask(x, scope=''):
   """Apply mask to a given weight tensor.
 
   Args:
     x: Input weight tensor
-    scope: The current variable scope. Defaults to ""
+    scope: The current variable scope. Defaults to "".
   Returns:
     Tensor representing masked_weights
   """
 
-  mask = _weight_mask_variable(x, scope)
-  threshold = _weight_threshold_variable(x, scope)
+  mask = pruning_utils.weight_mask_variable(x, scope)
+  threshold = pruning_utils.weight_threshold_variable(x, scope)
   # Add masked_weights in the weights namescope so as to make it easier
   # for the quantization library to add quant ops.
   masked_weights = math_ops.multiply(mask, x, _MASKED_WEIGHT_NAME)
@@ -335,6 +179,8 @@ def get_pruning_hparams():
     sparsity_function_exponent: float
       exponent = 1 is linearly varying sparsity between initial and final.
       exponent > 1 varies more slowly towards the end than the beginning
+    use_tpu: False
+      Indicates whether to use TPU
 
     We use the following sparsity function:
 
@@ -357,7 +203,7 @@ def get_pruning_hparams():
       do_not_prune=[''],
       threshold_decay=0.9,
       pruning_frequency=10,
-      nbins=255,
+      nbins=256,
       block_height=1,
       block_width=1,
       block_pooling_function='AVG',
@@ -365,7 +211,8 @@ def get_pruning_hparams():
       target_sparsity=0.5,
       sparsity_function_begin_step=0,
       sparsity_function_end_step=100,
-      sparsity_function_exponent=3)
+      sparsity_function_exponent=3,
+      use_tpu=False)
 
 
 class Pruning(object):
@@ -414,7 +261,7 @@ class Pruning(object):
     if graph_global_step is None:
       graph_global_step = training_util.get_global_step()
 
-    return math_ops.cast(graph_global_step, np.int32)
+    return math_ops.cast(graph_global_step, dtypes.int32)
 
   def _setup_sparsity(self):
     begin_step = self._spec.sparsity_function_begin_step
@@ -429,13 +276,13 @@ class Pruning(object):
           (begin_step, end_step))
 
     with ops.name_scope(self._spec.name):
-      p = math_ops.minimum(1.0,
-                           math_ops.maximum(
-                               0.0,
-                               math_ops.div(
-                                   math_ops.cast(self._global_step - begin_step,
-                                                 np.float32),
-                                   end_step - begin_step)))
+      p = math_ops.minimum(
+          1.0,
+          math_ops.maximum(
+              0.0,
+              math_ops.div(
+                  math_ops.cast(self._global_step - begin_step, dtypes.float32),
+                  end_step - begin_step)))
       sparsity = math_ops.add(
           math_ops.multiply(initial_sparsity - target_sparsity,
                             math_ops.pow(1 - p, exponent)),
@@ -445,17 +292,18 @@ class Pruning(object):
     return sparsity
 
   def _setup_last_update_step(self):
-    with variable_scope.variable_scope(self._spec.name) as scope:
+    with variable_scope.variable_scope(
+        self._spec.name, use_resource=self._spec.use_tpu) as scope:
       try:
         last_update_step = variable_scope.get_variable(
             'last_mask_update_step', [],
             initializer=init_ops.zeros_initializer(),
             trainable=False,
-            dtype=np.int32)
+            dtype=dtypes.int32)
       except ValueError:
         scope.reuse_variables()
         last_update_step = variable_scope.get_variable(
-            'last_mask_update_step', dtype=np.int32)
+            'last_mask_update_step', dtype=dtypes.int32)
     return last_update_step
 
   def _exists_in_do_not_prune_list(self, tensor_name):
@@ -497,18 +345,16 @@ class Pruning(object):
     with ops.name_scope(weights.op.name + '_pruning_ops'):
       abs_weights = math_ops.abs(weights)
       max_value = math_ops.reduce_max(abs_weights)
-      histogram = _histogram(
-          abs_weights, [0.0, max_value],
-          nbins=self._spec.nbins,
-          dtype=np.float32)
+      cdf_fn = pruning_utils.compute_cdf_from_histogram
+      if self._spec.use_tpu:
+        cdf_fn = pruning_utils.compute_cdf
 
-      cdf = math_ops.cumsum(histogram)
-      norm_cdf = math_ops.div(cdf, math_ops.reduce_sum(histogram))
+      norm_cdf = cdf_fn(abs_weights, [0.0, max_value], nbins=self._spec.nbins)
       current_threshold = math_ops.multiply(
           math_ops.div(
               math_ops.reduce_sum(
                   math_ops.cast(
-                      math_ops.less(norm_cdf, self._sparsity), np.float32)),
+                      math_ops.less(norm_cdf, self._sparsity), dtypes.float32)),
               float(self._spec.nbins)), max_value)
 
       smoothed_threshold = math_ops.add_n([
@@ -516,7 +362,7 @@ class Pruning(object):
           math_ops.multiply(threshold, self._spec.threshold_decay)
       ])
       new_mask = math_ops.cast(
-          math_ops.greater(abs_weights, smoothed_threshold), np.float32)
+          math_ops.greater(abs_weights, smoothed_threshold), dtypes.float32)
     return smoothed_threshold, new_mask
 
   def _maybe_update_block_mask(self, weights, threshold):
@@ -572,8 +418,8 @@ class Pruning(object):
           new_mask,
           [pooled_weights.get_shape()[1],
            pooled_weights.get_shape()[2]])
-      updated_mask = _kronecker_product(reshaped_mask,
-                                        array_ops.ones(self._block_dim))
+      updated_mask = pruning_utils.kronecker_product(
+          reshaped_mask, array_ops.ones(self._block_dim))
       sliced_mask = array_ops.slice(
           updated_mask, [0, 0],
           [squeezed_weights.get_shape()[0],
@@ -608,11 +454,12 @@ class Pruning(object):
           continue
 
       new_threshold, new_mask = self._maybe_update_block_mask(weight, threshold)
-      self._assign_ops.append(_variable_assign(threshold, new_threshold))
+      self._assign_ops.append(
+          pruning_utils.variable_assign(threshold, new_threshold))
 
       self._assign_ops.append(
-          _partitioned_variable_assign(mask, new_mask)
-          if is_partitioned else _variable_assign(mask, new_mask))
+          pruning_utils.partitioned_variable_assign(mask, new_mask)
+          if is_partitioned else pruning_utils.variable_assign(mask, new_mask))
 
   def mask_update_op(self):
     with ops.name_scope(self._spec.name):
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index 89e65713197afc6ed37346cb67a6e9be3fa9290f..f80b7c52c000f13b5ce98dd442ff21abfac37761 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -110,12 +110,12 @@ class PruningTest(test.TestCase):
       self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
       session.run(mask_update_op)
       masked_weights_val = masked_weights.eval()
-      self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
 
   def _blockMasking(self, hparams, weights, expected_mask):
 
     threshold = variables.Variable(0.0, name="threshold")
-    sparsity = variables.Variable(0.51, name="sparsity")
+    sparsity = variables.Variable(0.5, name="sparsity")
     test_spec = ",".join(hparams)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
 
@@ -138,7 +138,8 @@ class PruningTest(test.TestCase):
     weights_max = constant_op.constant(
         [[0.1, 0.0, 0.2, 0.0], [0.0, -0.1, 0.0, -0.2], [0.3, 0.0, 0.4, 0.0],
          [0.0, -0.3, 0.0, -0.4]])
-    expected_mask = [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]
+    expected_mask = [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
+                     [1., 1., 1., 1.], [1., 1., 1., 1.]]
 
     self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max,
                        expected_mask)
@@ -155,7 +156,8 @@ class PruningTest(test.TestCase):
     weights_max = constant_op.constant(
         [[[0.1, 0.0, 0.2, 0.0], [0.0, -0.1, 0.0, -0.2], [0.3, 0.0, 0.4, 0.0],
           [0.0, -0.3, 0.0, -0.4]]])
-    expected_mask = [[[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]]
+    expected_mask = [[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
+                      [1., 1., 1., 1.], [1., 1., 1., 1.]]]
 
     self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max,
                        expected_mask)
@@ -178,11 +180,12 @@ class PruningTest(test.TestCase):
       masked_weights_val = masked_weights.eval()
       session.run(mask_update_op)
       masked_weights_val = masked_weights.eval()
-      self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
 
   def testConditionalMaskUpdate(self):
     param_list = [
-        "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6"
+        "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6",
+        "nbins=100"
     ]
     test_spec = ",".join(param_list)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..56d3dcef20d1b1c34d6b04535e2b4dc7be7f7320
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py
@@ -0,0 +1,269 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility functions for adding pruning related ops to the graph.
+"""
+# pylint: disable=missing-docstring
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+
+_NBINS = 256
+
+
+def weight_mask_variable(var, scope):
+  """Create a mask for the weights.
+
+  This function adds a variable 'mask' to the graph.
+
+  Args:
+    var: the weight variable that needs to be masked
+    scope: The variable scope of the variable var
+
+  Returns:
+    the mask variable of the same size and shape as var, initialized to all 1s.
+  """
+  with variable_scope.variable_scope(scope):
+    mask = variable_scope.get_variable(
+        'mask',
+        var.get_shape(),
+        initializer=init_ops.ones_initializer(),
+        trainable=False,
+        dtype=var.dtype)
+  return mask
+
+
+def weight_threshold_variable(var, scope):
+  """Create a scalar threshold for the weights.
+
+  This function adds a variable
+  'threshold' to the graph.
+
+  Args:
+    var: The weight variable that needs to be masked
+    scope: The variable scope of the variable var
+
+  Returns:
+    a scalar threshold variable initialized to 0.
+  """
+  with variable_scope.variable_scope(scope):
+    threshold = variable_scope.get_variable(
+        'threshold', [],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        dtype=var.dtype)
+    return threshold
+
+
+def kronecker_product(mat1, mat2):
+  """Computes the Kronecker product of two matrices mat1 and mat2.
+
+  Args:
+    mat1: A matrix of size m x n
+    mat2: A matrix of size p x q
+  Returns:
+    Kronecker product of matrices mat1 and mat2 of size mp x nq
+  """
+
+  m1, n1 = mat1.get_shape().as_list()
+  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
+  m2, n2 = mat2.get_shape().as_list()
+  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
+  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
+
+
+def _histogram(values, value_range, nbins=100, dtype=dtypes.int32, name=None):
+  """Return histogram of values.
+
+  Given the tensor `values`, this operation returns a rank 1 histogram counting
+  the number of entries in `values` that fell into every bin.  The bins are
+  equal width and determined by the arguments `value_range` and `nbins`.
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+      values <= value_range[0] will be mapped to hist[0],
+      values >= value_range[1] will be mapped to hist[-1].
+    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
+    dtype:  dtype for returned histogram.
+    name:  A name for this operation (defaults to 'histogram').
+
+  Returns:
+    A 1-D `Tensor` holding histogram of values.
+
+  """
+  with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
+    values = ops.convert_to_tensor(values, name='values')
+    values = array_ops.reshape(values, [-1])
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins_float = np.float32(nbins)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
+
+    return math_ops.unsorted_segment_sum(
+        array_ops.ones_like(indices, dtype=dtype), indices, nbins, name=scope)
+
+
+def compute_cdf_from_histogram(values, value_range, **kwargs):
+  """Returns the normalized cumulative distribution of the given values tensor.
+
+  Computes the histogram and uses tf.cumsum to arrive at cdf
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+    **kwargs: keyword arguments: nbins, name
+
+  Returns:
+    A 1-D `Tensor` holding normalized cdf of values.
+
+  """
+  nbins = kwargs.get('nbins', _NBINS)
+  name = kwargs.get('name', None)
+  with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
+    histogram = _histogram(
+        values, value_range, dtype=dtypes.float32, nbins=nbins)
+    cdf = math_ops.cumsum(histogram)
+    return math_ops.div(cdf, math_ops.reduce_max(cdf))
+
+
+def compute_cdf(values, value_range, **kwargs):
+  """Returns the normalized cumulative distribution of the given values tensor.
+
+  Uses tf.while_loop to directly compute the cdf of the values. Number of bins
+  for histogram is fixed at _NBINS=255
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`
+    **kwargs: keyword arguments: name
+
+  Returns:
+    A 1-D `Tensor` holding normalized cdf of values.
+
+  """
+  nbins = _NBINS
+  name = kwargs.get('name', None)
+  with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
+    values = ops.convert_to_tensor(values, name='values')
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins_float = np.float32(nbins)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
+
+    cdf = array_ops.zeros(nbins)
+    i = constant_op.constant(0)
+
+    def loop_cond(loop_count, _):
+      return math_ops.less(loop_count, nbins)
+
+    def loop_body(loop_count, cdf):
+      temp = math_ops.reduce_sum(
+          math_ops.cast(
+              math_ops.less_equal(indices, loop_count), dtypes.float32))
+      cdf = math_ops.add(
+          cdf,
+          array_ops.one_hot(
+              loop_count, depth=_NBINS, on_value=temp, off_value=0.0))
+      return [loop_count + 1, cdf]
+
+    _, cdf = control_flow_ops.while_loop(
+        loop_cond, loop_body, [i, cdf], maximum_iterations=nbins)
+
+    return math_ops.div(cdf, math_ops.reduce_max(cdf))
+
+
+def determine_partitioned_axis(partitioned_variable):
+  partitioned_axis = 0
+  concatenated_variable_shape = partitioned_variable.get_shape()
+  for partition in partitioned_variable:
+    partition_shape = partition.get_shape()
+    maybe_partitioned_axis = np.less(partition_shape,
+                                     concatenated_variable_shape)
+    # Sanity check: make sure number of partitioned axis == 1
+    if np.count_nonzero(maybe_partitioned_axis) != 1:
+      raise ValueError('Number of partitioned axes %s not equal to 1' %
+                       np.count_nonzero(maybe_partitioned_axis))
+    partitioned_axis = np.where(maybe_partitioned_axis)[0][0]
+  return partitioned_axis
+
+
+def variable_assign(var, new_value):
+  return state_ops.assign(var, new_value, name=var.op.name + '_assign')
+
+
+def partitioned_variable_assign(partitioned_var, new_value):
+  """Assign op for partitioned variables.
+
+  Args:
+    partitioned_var: A partitioned tensorflow variable
+    new_value: Value to be assigned to the variable var
+
+  Returns:
+    A tensorflow op that groups the assign ops for each of the variable slices
+  """
+  # Determine which axis was used to partition the variable. Currently
+  # tensorflow allows partitioning variable only along 1 axis.
+  axis = 0 if len(partitioned_var) == 1 else determine_partitioned_axis(
+      partitioned_var)
+
+  partition_sizes = np.array(
+      [partition.get_shape()[axis] for partition in partitioned_var])
+  new_partitioned_values = array_ops.split(
+      new_value,
+      ops.convert_to_tensor(partition_sizes, dtype=dtypes.int32),
+      axis=axis)
+  op_list = []
+  for partition in partitioned_var:
+    op_list.append(
+        variable_assign(partition, new_partitioned_values[len(op_list)]))
+  return control_flow_ops.group(
+      *op_list, name=partitioned_var.name + '_group_assign')
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..10e1dd0a8eee88f357fbe60bf00f180c05f2c4d2
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
@@ -0,0 +1,86 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utility functions in pruning_utils.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.model_pruning.python import pruning_utils
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class PruningUtilsTest(test.TestCase):
+
+  def testHistogram(self):
+    width = 10
+    height = 10
+    nbins = 100
+    expected_histogram = np.full(nbins, 1.0)
+    init = init_ops.constant_initializer(np.linspace(0.0, 1.0, width * height))
+    weights = variable_scope.get_variable(
+        "weights", [width, height], initializer=init)
+    histogram = pruning_utils._histogram(
+        weights, [0, 1.0], nbins, dtype=np.float32)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      computed_histogram = histogram.eval()
+    self.assertAllEqual(expected_histogram, computed_histogram)
+
+  def testCDF(self):
+    nbins = 5
+    weights = constant_op.constant([-1, 0, 1, 1.5, 2, 3, 4, 5, 10, 100])
+    abs_weights = math_ops.abs(weights)
+    norm_cdf = pruning_utils.compute_cdf_from_histogram(
+        abs_weights, [0.0, 5.0], nbins=nbins)
+    expected_cdf = np.array([0.1, 0.4, 0.5, 0.6, 1.0], dtype=np.float32)
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      norm_cdf_val = sess.run(norm_cdf)
+      self.assertAllEqual(len(norm_cdf_val), nbins)
+      self.assertAllEqual(expected_cdf, norm_cdf_val)
+
+  def _compare_cdf(self, values):
+    abs_values = math_ops.abs(values)
+    max_value = math_ops.reduce_max(abs_values)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      cdf_from_histogram = pruning_utils.compute_cdf_from_histogram(
+          abs_values, [0.0, max_value], nbins=pruning_utils._NBINS)
+      cdf = pruning_utils.compute_cdf(abs_values, [0.0, max_value])
+      return cdf.eval(), cdf_from_histogram.eval()
+
+  def testCDFEquivalence2D(self):
+    width = 100
+    height = 100
+    weights = variable_scope.get_variable("weights", shape=[width, height])
+    cdf_val, cdf_from_histogram_val = self._compare_cdf(weights)
+    self.assertAllEqual(cdf_val, cdf_from_histogram_val)
+
+  def testCDFEquivalence4D(self):
+    weights = variable_scope.get_variable("weights", shape=[5, 5, 128, 128])
+    cdf_val, cdf_from_histogram_val = self._compare_cdf(weights)
+    self.assertAllEqual(cdf_val, cdf_from_histogram_val)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 913935b38246f1c5c0f7da4c1ea1f986bc00891b..b9b482a6981e03144c6d00f2a38b71959b4b3621 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -76,6 +76,8 @@ struct NcclManager::Communicator {
 namespace {
 ncclDataType_t ToNcclType(DataType t) {
   switch (t) {
+    case DT_HALF:
+      return ncclHalf;
     case DT_FLOAT:
       return ncclFloat;
     case DT_DOUBLE:
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 985b2bae2566c38dfb2c71a899e4b03bbb8fa55d..06ca65e33ad6f5fb6620144231dd368379dcc190 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -48,35 +48,9 @@ static std::vector<BaseGPUDevice*> GetGPUDevices() {
   return gpus;
 }
 
+template <typename Scalar>
 class NcclManagerTest : public ::testing::Test {
- protected:
-  static void SetUpTestCase() {
-    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
-    devices = new std::vector<BaseGPUDevice*>(GetGPUDevices());
-    CHECK(!devices->empty());
-    LOG(ERROR) << "Running test with " << devices->size() << " gpus";
-  }
-  static void TearDownTestCase() {
-    for (auto device : *devices) delete device;
-    delete devices;
-  }
-
-  static Allocator* gpu_allocator(BaseGPUDevice* device) {
-    return device->GetStepAllocator(AllocatorAttributes(),
-                                    nullptr /* step_resource_manager */);
-  }
-
-  static std::vector<BaseGPUDevice*>* devices;
-
-  template <typename Scalar>
-  perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-      const Scalar* cuda_memory) {
-    perftools::gputools::DeviceMemoryBase wrapped(
-        const_cast<Scalar*>(cuda_memory));
-    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
-    return typed;
-  }
-
+ public:
   // A single all-reduce to apply.
   struct TestCase {
     string key;
@@ -89,42 +63,52 @@ class NcclManagerTest : public ::testing::Test {
     int num_completed = 0;
   };
 
+  static void SetUpTestCase() {
+    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
+    devices_ = new std::vector<BaseGPUDevice*>(GetGPUDevices());
+    CHECK(!devices_->empty());
+    LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
+  }
+
+  static void TearDownTestCase() {
+    for (auto device : *devices_) delete device;
+    delete devices_;
+  }
+
   TestCase* MakeTestCase(int num_ranks, ncclRedOp_t reduction_op,
                          TensorShape shape, float value_offset) {
     TestCase* test_case = new TestCase();
-    test_case->expected = Tensor(DT_FLOAT, shape);
+    test_case->expected = Tensor(data_type_, shape);
     if (reduction_op == ncclProd) {
-      test::FillFn<float>(&test_case->expected, [](int) { return 1; });
+      test::FillFn<Scalar>(&test_case->expected,
+                           [](int) { return static_cast<Scalar>(1); });
     } else if (reduction_op == ncclSum) {
-      test::FillFn<float>(&test_case->expected, [](int) { return 0; });
+      test::FillFn<Scalar>(&test_case->expected,
+                           [](int) { return static_cast<Scalar>(0); });
     } else if (reduction_op == ncclMax) {
-      test::FillFn<float>(&test_case->expected, [](int) {
-        return -1 * std::numeric_limits<float>::max();
-      });
+      test::FillFn<Scalar>(&test_case->expected, [](int) { return -max_; });
     } else if (reduction_op == ncclMin) {
-      test::FillFn<float>(&test_case->expected, [](int) {
-        return std::numeric_limits<float>::max();
-      });
+      test::FillFn<Scalar>(&test_case->expected, [](int) { return max_; });
     } else {
       LOG(FATAL) << "Invalid reduction_op " << reduction_op;
     }
 
-    int mult = 1;
-    for (int i = 0; i < num_ranks; ++i) {
-      auto* device = devices->at(i % devices->size());
+    float value_scale = 0.01;  // Small scale to avoid fp16 overflow.
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
 
-      Tensor in_cpu(DT_FLOAT, shape);
-      test::FillFn<float>(&in_cpu, [mult, value_offset](int index) {
-        return value_offset + (index + 1) * mult;
+      Tensor in_cpu(data_type_, shape);
+      test::FillFn<Scalar>(&in_cpu, [&](int index) {
+        return static_cast<Scalar>((index + 1) * value_scale + value_offset);
       });
       for (int j = 0; j < shape.num_elements(); ++j) {
-        auto in_val = in_cpu.flat<float>()(j);
-        auto out_expr = test_case->expected.flat<float>();
+        auto in_val = in_cpu.flat<Scalar>()(j);
+        auto out_expr = test_case->expected.template flat<Scalar>();
         if (reduction_op == ncclProd) {
-          out_expr(j) *= in_val;
+          out_expr(j) = out_expr(j) * in_val;
         } else if (reduction_op == ncclSum) {
-          out_expr(j) += in_val;
+          out_expr(j) = out_expr(j) + in_val;
         } else if (reduction_op == ncclMax) {
           if (in_val > out_expr(j)) {
             out_expr(j) = in_val;
@@ -136,26 +120,18 @@ class NcclManagerTest : public ::testing::Test {
         }
       }
 
-      mult *= 10;
-      test_case->ins.emplace_back(gpu_allocator(device), DT_FLOAT, shape);
-      test_case->outs.emplace_back(gpu_allocator(device), DT_FLOAT, shape);
+      value_scale *= 10;
+      test_case->ins.emplace_back(GpuAllocator(device), data_type_, shape);
+      test_case->outs.emplace_back(GpuAllocator(device), data_type_, shape);
 
       const Tensor& in_gpu = test_case->ins.back();
-      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<float>().data());
-      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<float>().data(),
+      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
+      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
                          in_cpu.TotalBytes());
     }
     return test_case;
   }
 
-  NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
-    return [this, test_case](Status s) {
-      mutex_lock l(test_case->mu);
-      ++test_case->num_completed;
-      test_case->final_status.Update(s);
-    };
-  }
-
   void VerifyResults(const string& case_label, TestCase* test_case) {
     // Wait for the done callback to be called.
     {
@@ -168,41 +144,84 @@ class NcclManagerTest : public ::testing::Test {
       test_case->mu.unlock();
     }
     // Copy memory to host and verify.
-    for (int i = 0; i < test_case->outs.size(); ++i) {
-      auto* device = devices->at(i % devices->size());
+    for (int rank = 0; rank < test_case->outs.size(); ++rank) {
+      auto* device = GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
-      const Tensor& out_gpu = test_case->outs[i];
-      Tensor out_cpu(DT_FLOAT, out_gpu.shape());
-      auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<float>().data());
-      stream->ThenMemcpy(out_cpu.flat<float>().data(), out_gpu_mem,
+      const Tensor& out_gpu = test_case->outs[rank];
+      Tensor out_cpu(data_type_, out_gpu.shape());
+      auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<Scalar>().data());
+      stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
-      test::ExpectTensorEqual<float>(test_case->expected, out_cpu);
+      test::ExpectTensorNear<Scalar>(test_case->expected, out_cpu, 0.01);
     }
   }
+
+  NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
+    return [this, test_case](Status s) {
+      mutex_lock l(test_case->mu);
+      ++test_case->num_completed;
+      test_case->final_status.Update(s);
+    };
+  }
+
+  static BaseGPUDevice* GetDevice(size_t rank) {
+    return devices_->at(rank % devices_->size());
+  }
+
+ private:
+  static Allocator* GpuAllocator(BaseGPUDevice* device) {
+    return device->GetStepAllocator(AllocatorAttributes(),
+                                    nullptr /* step_resource_manager */);
+  }
+
+  static perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
+      const Scalar* cuda_memory) {
+    perftools::gputools::DeviceMemoryBase wrapped(
+        const_cast<Scalar*>(cuda_memory));
+    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+    return typed;
+  }
+
+ private:
+  static std::vector<BaseGPUDevice*>* devices_;
+  static const DataType data_type_;
+  static const Scalar max_;
 };
-std::vector<BaseGPUDevice*>* NcclManagerTest::devices = nullptr;
+
+template <typename Scalar>
+std::vector<BaseGPUDevice*>* NcclManagerTest<Scalar>::devices_ = nullptr;
+template <typename Scalar>
+const DataType NcclManagerTest<Scalar>::data_type_ =
+    DataTypeToEnum<Scalar>::value;
+template <typename Scalar>
+const Scalar NcclManagerTest<Scalar>::max_ =
+    Eigen::NumTraits<Scalar>::highest();
+
+// Instantiate tests for float and half.
+using TypeList = ::testing::Types<float, Eigen::half>;
+TYPED_TEST_CASE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
-TEST_F(NcclManagerTest, BasicSumReduction) {
+TYPED_TEST(NcclManagerTest, BasicSumReduction) {
   const int num_ranks = 3;
 
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
-    std::unique_ptr<TestCase> test_case(
-        MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0));
-    for (int device_num = 0; device_num < num_ranks; ++device_num) {
-      auto* device = devices->at(device_num % devices->size());
+    std::unique_ptr<typename TestFixture::TestCase> test_case(
+        this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       NcclManager::instance()->AddToAllReduce(
           num_ranks, "allreduce", reduction_op, device->executor(),
-          device->gpu_id(), event_mgr, stream, &test_case->ins[device_num],
-          &test_case->outs[device_num], CreateDoneCallback(test_case.get()));
+          device->gpu_id(), event_mgr, stream, &test_case->ins[rank],
+          &test_case->outs[rank], this->CreateDoneCallback(test_case.get()));
     }
 
     LOG(ERROR) << "Verifying results";
-    VerifyResults("test_case", test_case.get());
+    this->VerifyResults("test_case", test_case.get());
   }
 }
 
@@ -213,7 +232,7 @@ TEST_F(NcclManagerTest, BasicSumReduction) {
 // with num_ranks > devices->size(), for some GPUs (e.g. K20m).
 // To test the higher settings, increase num_ranks,
 // num_collectives_per_iteration and time_limit_micros.
-TEST_F(NcclManagerTest, MultipleCallers) {
+TYPED_TEST(NcclManagerTest, MultipleCallers) {
   const int num_ranks = 1;                      // 2;
   const int num_collectives_per_iteration = 1;  // 1000;
   const int num_threads = 3;
@@ -223,49 +242,49 @@ TEST_F(NcclManagerTest, MultipleCallers) {
   srand(Env::Default()->NowMicros());
 
   for (;;) {
-    std::vector<std::pair<int, int>> case_and_device_num;
-    std::vector<std::unique_ptr<TestCase>> test_cases;
+    std::vector<std::pair<int, int>> case_and_rank;
+    std::vector<std::unique_ptr<typename TestFixture::TestCase>> test_cases;
     for (int i = 0; i < num_collectives_per_iteration; ++i) {
-      test_cases.emplace_back(
-          MakeTestCase(num_ranks, ncclSum,
-                       TensorShape({100, i % 5 + 1, i % 3 + 1}), i + 0.1 * i));
+      test_cases.emplace_back(this->MakeTestCase(
+          num_ranks, ncclSum, TensorShape({100, i % 5 + 1, i % 3 + 1}),
+          1.1f * i));
       for (int j = 0; j < num_ranks; ++j) {
-        case_and_device_num.emplace_back(i, j);
+        case_and_rank.emplace_back(i, j);
       }
     }
 
-    for (int i = 0; i < num_ranks; ++i) {
-      auto* device = devices->at(i % devices->size());
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       SE_ASSERT_OK(stream->BlockHostUntilDone());
     }
 
-    std::shuffle(case_and_device_num.begin(), case_and_device_num.end(),
+    std::shuffle(case_and_rank.begin(), case_and_rank.end(),
                  std::mt19937(std::random_device()()));
 
-    mutex mu;  // guards case_and_device_num.
+    mutex mu;  // guards case_and_rank.
     std::unique_ptr<thread::ThreadPool> pool(
         new thread::ThreadPool(Env::Default(), "test", num_threads));
-    const int to_schedule = case_and_device_num.size();
+    const int to_schedule = case_and_rank.size();
     for (int i = 0; i < to_schedule; ++i) {
       auto fn = [&]() {
-        int device_num;
+        int rank;
         int test_num;
         {
           mutex_lock l(mu);
-          test_num = case_and_device_num.back().first;
-          device_num = case_and_device_num.back().second;
-          case_and_device_num.pop_back();
+          test_num = case_and_rank.back().first;
+          rank = case_and_rank.back().second;
+          case_and_rank.pop_back();
         }
-        auto* device = devices->at(device_num % devices->size());
+        auto* device = this->GetDevice(rank);
         auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
         auto* stream = device->tensorflow_gpu_device_info()->stream;
-        TestCase* test_case = test_cases[test_num].get();
+        typename TestFixture::TestCase* test_case = test_cases[test_num].get();
         NcclManager::instance()->AddToAllReduce(
             num_ranks, strings::StrCat("allreduce", test_num), ncclSum,
             device->executor(), device->gpu_id(), event_mgr, stream,
-            &test_case->ins[device_num], &test_case->outs[device_num],
-            CreateDoneCallback(test_case));
+            &test_case->ins[rank], &test_case->outs[rank],
+            this->CreateDoneCallback(test_case));
       };
       pool->Schedule(fn);
     }
@@ -274,7 +293,8 @@ TEST_F(NcclManagerTest, MultipleCallers) {
     LOG(ERROR) << "Verifying results for " << num_collectives_per_iteration
                << " collectives";
     for (int i = 0; i < test_cases.size(); ++i) {
-      VerifyResults(strings::StrCat("collective", i), test_cases[i].get());
+      this->VerifyResults(strings::StrCat("collective", i),
+                          test_cases[i].get());
     }
 
     int64 delta = Env::Default()->NowMicros() - start;
diff --git a/tensorflow/contrib/nccl/ops/nccl_ops.cc b/tensorflow/contrib/nccl/ops/nccl_ops.cc
index 8eb804c2e988f313ba1b340217cae20f1f5502c7..a353a34b80add119fcdc8bc4230eddf5a77b30e8 100644
--- a/tensorflow/contrib/nccl/ops/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/ops/nccl_ops.cc
@@ -25,7 +25,7 @@ REGISTER_OP("NcclAllReduce")
     .Input("input: T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -51,7 +51,7 @@ REGISTER_OP("NcclReduce")
     .Input("input: num_devices * T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -69,7 +69,7 @@ reduction: the reduction operation to perform.
 REGISTER_OP("_NcclReduceSend")
     .Input("input: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -92,7 +92,7 @@ REGISTER_OP("_NcclReduceRecv")
     .Input("input: T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -118,7 +118,7 @@ shared_name: Identifier that is shared between ops of the same reduce.
 REGISTER_OP("NcclBroadcast")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("shape: shape")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -135,7 +135,7 @@ shape: The shape of the input tensor.
 
 REGISTER_OP("_NcclBroadcastSend")
     .Input("input: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -157,7 +157,7 @@ shared_name: Identifier that is shared between ops of the same broadcast.
 REGISTER_OP("_NcclBroadcastRecv")
     .Input("shape: int32")
     .Output("output: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
index 98fe394c5b38294700617591992d3207b0a4706b..423a8689aeee062fb58eaf9d6d9b980b0998754e 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@@ -72,7 +72,7 @@ class NcclTestCase(test.TestCase):
           two.
       device_sets: Tuple of virtual devices to run test on.
     """
-    for dtype in [np.float32, np.int32, np.int64, np.float64]:
+    for dtype in [np.float16, np.float32, np.int32, np.int64, np.float64]:
       # Create session inside outer loop to test use of
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index 85e3e8d3791f2331ed249c0b7f67a3dbde4fca08..ac04ad99110b016b62e091aa10c7f565e5093bc1 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -85,7 +85,7 @@ class MovingAverageOptimizerTest(test.TestCase):
               state_ops.assign_add(ema_var1, [4.0, 4.0])
           ])
 
-          # Test taht saver with missing ema variables will fail.
+          # Test that saver with missing ema variables will fail.
           with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
             opt.swapping_saver(var_list=[var0])
 
@@ -123,7 +123,7 @@ class MovingAverageOptimizerTest(test.TestCase):
             self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
             self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
             self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restore back to previou state.
+            # Restore back to previous state.
             train_saver.restore(sess, save_path)
 
           # If updates are parallel, this is not always true after the 1st step.
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 26ea9135f57fb9fe95e61023bccb97d1d4f5ea1c..86e5f4a43725b67cd7dba8152e788b64a5d57d26 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -48,6 +48,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 08f9699e850a6519dbb5de3bbf0d8b8de01c61b2..8ac9b581455f8f4c7af1a66432169ae179de1634 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -24,23 +24,24 @@ import os
 
 import six
 
-from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.optimizer_v2 import adam
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.layers import core
+from tensorflow.python.keras._impl.keras.layers import core
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import saver as core_saver
 from tensorflow.python.training import training_util
 
@@ -372,6 +373,50 @@ class CheckpointingTests(test.TestCase):
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
 
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
   def _get_checkpoint_name(self, name):
     root = checkpointable.Checkpointable()
     checkpointable_utils.add_variable(
@@ -411,7 +456,7 @@ class CheckpointingTests(test.TestCase):
         optimizer.apply_gradients(
             [(g, v) for g, v in zip(grad, model.vars)])
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
diff --git a/tensorflow/contrib/optimizer_v2/momentum_test.py b/tensorflow/contrib/optimizer_v2/momentum_test.py
index f37eb48181d6bef195215b86f14f69d3df65a8ac..26724f66c2a1db1d01577b31b739af18f51d3976 100644
--- a/tensorflow/contrib/optimizer_v2/momentum_test.py
+++ b/tensorflow/contrib/optimizer_v2/momentum_test.py
@@ -237,7 +237,17 @@ class MomentumOptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
 
       # pylint: disable=cell-var-from-loop
       def loss():
@@ -256,7 +266,17 @@ class MomentumOptimizerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
-    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
 
     def loss():
       return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 25d19578ea8c4f53019657ab85950a814d1a47b8..ce15db6f1ec067e5aeb6ddbc8939d2b773692269 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -579,7 +579,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   ### State
 
-  Internal methods apre passed a `state` argument with the correct
+  Internal methods are passed a `state` argument with the correct
   values to use for the slot and non-slot variables, and the hyper
   parameters.
   """
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..046652cbc5a2f11a3e75fdcc7b91ec00be21d300
--- /dev/null
+++ b/tensorflow/contrib/proto/BUILD
@@ -0,0 +1,16 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "proto",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = [
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+    ],
+)
diff --git a/tensorflow/python/ops/distributions/bijectors.py b/tensorflow/contrib/proto/__init__.py
similarity index 62%
rename from tensorflow/python/ops/distributions/bijectors.py
rename to tensorflow/contrib/proto/__init__.py
index 69c3a5d4c0ba86586ccb6e55e71d898b1bf7c035..bc5a49de78e251cb4a854fc11a7b13b39820127d 100644
--- a/tensorflow/python/ops/distributions/bijectors.py
+++ b/tensorflow/contrib/proto/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Core module for TensorFlow distribution bijectors."""
+"""Ops and modules related to proto.
+
+@@decode_proto
+@@encode_proto
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.proto.python.ops.decode_proto_op import decode_proto
+from tensorflow.contrib.proto.python.ops.encode_proto_op import encode_proto
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import,unused-import
-from tensorflow.python.ops.distributions.bijector import Bijector
-from tensorflow.python.ops.distributions.identity_bijector import Identity
-
-# pylint: enable=wildcard-import,unused-import
 from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["Bijector", "Identity"]
-
-remove_undocumented(__name__, _allowed_symbols)
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/proto/python/ops/BUILD b/tensorflow/contrib/proto/python/ops/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f17065477e1e14d24a16338b1a11d98da44639fe
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/BUILD
@@ -0,0 +1,44 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_wrapper_py",
+)
+
+py_library(
+    name = "decode_proto_op_py",
+    srcs = ["decode_proto_op.py"],
+    deps = [
+        ":gen_decode_proto_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_decode_proto_op_py",
+    out = "gen_decode_proto_op.py",
+    deps = [
+        "//tensorflow/core:decode_proto_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "encode_proto_op_py",
+    srcs = ["encode_proto_op.py"],
+    deps = [
+        ":gen_encode_proto_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_encode_proto_op_py",
+    out = "gen_encode_proto_op.py",
+    deps = [
+        "//tensorflow/core:encode_proto_ops_op_lib",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/ops/decode_proto_op.py b/tensorflow/contrib/proto/python/ops/decode_proto_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dc000ebe49724e7571ade500eb29de25be89485
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/decode_proto_op.py
@@ -0,0 +1,25 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""Protocol Buffer decoding from tensors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.gen_decode_proto_op import decode_proto_v2 as decode_proto
+from tensorflow.python.framework import ops
+ops.NotDifferentiable("DecodeProtoV2")
diff --git a/tensorflow/contrib/proto/python/ops/encode_proto_op.py b/tensorflow/contrib/proto/python/ops/encode_proto_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac12198b2e462b73238778e91c1e9b6be156182c
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/encode_proto_op.py
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""Protocol Buffer encoding from tensors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.gen_encode_proto_op import encode_proto
+from tensorflow.python.framework import ops
+
+ops.NotDifferentiable("EncodeProto")
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 348c824a4072c3329ac4a3441c19c71598bc9c03..c83623ec947c1550991352a9dd9a5c6ee9282290 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -2,14 +2,17 @@
 
 tf.contrib.quantize provides tools for transforming graphs to include ops to
 model quantization of weights, biases and activations during both training and
-inference. This is done using the
+inference. The details of the transformation implemented in this package is
+described here [1].
+
+This is done using the
 [fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization).
 
-Recent literature has shown that fixed point networks provide comparable
-performance to floating point networks [1]. This is achieved by modeling the
-quantization operation during training in both the forward and backward passes.
+Literature has shown that fixed point networks provide comparable performance to
+floating point networks [2]. This is achieved by modeling the quantization
+operation during training in both the forward and backward passes.
 The fake quantization operator achieves this by modeling the quantizer as a pass
-through estimator [2]. Note that during back propagation, the parameters are
+through estimator [3]. Note that during back propagation, the parameters are
 updated at high precision as this is needed to ensure sufficient precision in
 accumulating tiny adjustments to the parameters. However, for the forward pass,
 the parameters and activations are quantized to the desired lower precision.
@@ -61,9 +64,11 @@ These rewrites are an active area of research and experimentation, so the
 rewrites and quantized training will likely not work across all models, though
 we hope to work towards generalizing these techniques.
 
+[1] B.Jacob et al., "Quantization and Training of Neural Networks for Efficient
+Integer-Arithmetic-Only Inference", https://arxiv.org/abs/1712.05877
 
-[1] P.Gysel, "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
+[2] P.Gysel et al., "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
 NEURAL NETWORKS", https://arxiv.org/pdf/1604.03168.pdf
 
-[2] Y.Bengio, "Estimating or Propagating Gradients Through Stochastic Neurons
-for Conditional Computation", https://arxiv.org/abs/1308.3432
+[3] Y.Bengio et al., "Estimating or Propagating Gradients Through Stochastic
+Neurons for Conditional Computation", https://arxiv.org/abs/1308.3432
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 4a8f8a04cc521d9ee7885b4318814a6f15008eef..aa0ef643088ef36b84596d08f78c29594ceca2d6 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -545,7 +545,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
         gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
 
   if not has_scaling:
-    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+    gamma_tensor = array_ops.ones(moving_mean_tensor.shape)
 
   return _BatchNormMatch(
       layer_op=None,
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index a4f7b1b22139588be29171126d43b872d6658168..5c0e17dc8646ce7850e26ffaa80c0201cea456af 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -51,7 +51,6 @@ def LastValueQuantize(inputs,
                       per_channel=False,
                       init_min=-6.0,
                       init_max=6.0,
-                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                       name_prefix='LastValueQuant',
                       reuse=None,
@@ -69,8 +68,6 @@ def LastValueQuantize(inputs,
       quantization ranges per output channel.
     init_min: a float scalar, the initial value for variable min.
     init_max: a float scalar, the initial value for variable max.
-    updates_collection: (Optional) collections to collect the update ops for
-      computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
     name_prefix: name_prefix for created nodes.
@@ -133,7 +130,6 @@ def LastValueQuantize(inputs,
     # TFLite requires that 0.0 if always in the [min; max] range.
     batch_min = math_ops.minimum(batch_min, 0.0)
     assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast')
-    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -146,7 +142,6 @@ def LastValueQuantize(inputs,
     # TFLite requires that 0.0 if always in the [min; max] range.
     batch_max = math_ops.maximum(batch_max, 0.0)
     assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast')
-    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
@@ -163,7 +158,6 @@ def MovingAvgQuantize(inputs,
                       init_min=-6.0,
                       init_max=6.0,
                       ema_decay=0.999,
-                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                       name_prefix='MovingAvgQuantize',
                       reuse=None,
@@ -182,8 +176,6 @@ def MovingAvgQuantize(inputs,
     init_min: a float scalar, the initial value for variable min.
     init_max: a float scalar, the initial value for variable max.
     ema_decay: EMA decay parameter.
-    updates_collection: (Optional) collections to collect the update ops for
-      computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
     name_prefix: name_prefix for created nodes.
@@ -246,7 +238,6 @@ def MovingAvgQuantize(inputs,
     batch_min = math_ops.minimum(batch_min, 0.0)
     assign_min = moving_averages.assign_moving_average(
         min_var, batch_min, ema_decay, name='AssignMinEma')
-    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -260,7 +251,6 @@ def MovingAvgQuantize(inputs,
     batch_max = math_ops.maximum(batch_max, 0.0)
     assign_max = moving_averages.assign_moving_average(
         max_var, batch_max, ema_decay, name='AssignMaxEma')
-    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index d53d4d7b108e38bec3e2fa4727e85b5ed88f3a9e..d2d0426d233aaadb4ffd0fb222c77ade0a98278c 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.quantize.python import quant_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
 
 # Quantizable operation types that are supported by the quantization rewrite.
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
@@ -41,9 +42,16 @@ def Quantize(graph,
              activation_bits=8,
              ema_decay=0.999,
              quant_delay=None,
-             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES):
+             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
+             scope=None):
   """Updates graph with quantization operations.
 
+  Currently we quantize the following tensors:
+  * Conv/MatMul: Quantize the weights if it matches.
+  * Activation: Quantize the output if it matches.
+  * Bypass/Post-activation Bypass: Quantize both input and output
+    if it matches.
+
   Args:
     graph: Graph to modify.
     is_training: Whether quantizing training graph or eval graph.
@@ -57,13 +65,21 @@ def Quantize(graph,
       training.
     vars_collection: (Optional) Collection where to store the variables for
       quantization interval ends.
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
   Raises:
     ValueError: When quantization fails.
   """
+  if scope and not scope.endswith('/'):
+    scope += '/'
+
   input_to_ops_map = input_to_ops.InputToOps(graph)
   for layer_match in _FindLayersToQuantize(graph):
     # Quantize the weights.
     context = _GetContextFromOp(layer_match.layer_op)
+
+    # If `scope` is given, only quantize it if the consumer of weights
+    # (the layer op) is in the right scope.
     _InsertQuantOp(
         context,
         'weights_quant',
@@ -74,7 +90,8 @@ def Quantize(graph,
         quant_delay=quant_delay,
         narrow_range=True,
         vars_collection=vars_collection,
-        bits=weight_bits)
+        bits=weight_bits,
+        consumer_scope=scope)
 
     # Quantize the activations.
     consumer_ops = input_to_ops_map.ConsumerOperations(
@@ -82,6 +99,9 @@ def Quantize(graph,
     add_context = context
     if layer_match.bypass_op:
       add_context = re.search(r'^(.*)/([^/]+)', context).group(1)
+
+    # If `scope` is given, only quantize it if the producer of weights
+    # (usually it's the layer op) is in the right scope.
     _InsertQuantOp(
         add_context,
         'act_quant',
@@ -93,11 +113,14 @@ def Quantize(graph,
         quant_delay=quant_delay,
         vars_collection=vars_collection,
         bits=activation_bits,
-        init_min=0.0)
+        init_min=0.0,
+        producer_scope=scope)
 
     # Quantize the inputs and output to the bypass (if it exists). The input to
     # the bypass is the bias add, and the output is the activation.
     if layer_match.bypass_op is not None:
+      # If `scope` is given, only quantize it if the both the producer and the
+      # consumer are in the right scope.
       _InsertQuantOp(
           context,
           'conv_quant',
@@ -107,7 +130,9 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope,
+          consumer_scope=scope)
       _InsertQuantOp(
           add_context,
           'add_quant',
@@ -118,12 +143,16 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope,
+          consumer_scope=scope)
 
     # Quantize bypass ops that occur after the activation.
     if layer_match.post_activation_bypass_op is not None:
       post_activation_bypass_context = re.search(
           r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1)
+      # If `scope` is given, only quantize it if the producer is in the right
+      # scope.
       _InsertQuantOp(
           post_activation_bypass_context,
           'post_activation_bypass_quant',
@@ -135,7 +164,8 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope)
 
 
 def _FindLayersToQuantize(graph):
@@ -382,7 +412,9 @@ def _InsertQuantOp(context,
                    ema_decay=0.999,
                    quant_delay=None,
                    vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
-                   narrow_range=False):
+                   narrow_range=False,
+                   producer_scope=None,
+                   consumer_scope=None):
   """Inserts a quant op between a producer op and (multiple) consumer ops.
 
   Args:
@@ -407,10 +439,34 @@ def _InsertQuantOp(context,
       quantization interval ends.
     narrow_range: Whether to use the narrow quantization range
       [1; 2^bits - 1] or wide range [0; 2^bits - 1].
+    producer_scope: The restriction of producer scope. If not None, the new op
+      will be inserted only when the producer is in this scope.
+    consumer_scope: The restriction of producer scope. If not None, the new op
+      will be inserted only when all the consumers are in this scope.
   Raises:
     ValueError: When producer operation is not directly connected to the
       consumer operation.
   """
+  if producer_scope and not producer.name.startswith(producer_scope):
+    logging.info(
+        '_InsertQuantOp ignores context="%s" name="%s" '
+        'because producer "%s" is not in scope "%s"',
+        context, name, producer.name, producer_scope)
+    return
+
+  if consumer_scope:
+    consumers_in_scope = []
+    for consumer in consumers:
+      if consumer.name.startswith(consumer_scope):
+        consumers_in_scope.append(consumer)
+      else:
+        logging.info(
+            '_InsertQuantOp context="%s" name="%s" ignores '
+            'consumer "%s" because it is not in scope "%s"',
+            context, name, consumer.name, consumer_scope)
+        return
+    consumers = consumers_in_scope
+
   name_prefix = _AddContextToName(context, name)
   # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
   # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 0b74b438ac317967bbe10ad936b451de6f69d62c..11d052d7f491dc029d1bda9b47364d6e9c880a67 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -28,7 +28,8 @@ def _create_graph(input_graph=None,
                   weight_bits=8,
                   activation_bits=8,
                   quant_delay=None,
-                  freeze_bn_delay=None):
+                  freeze_bn_delay=None,
+                  scope=None):
   """Rewrites an input_graph in place for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
@@ -48,6 +49,8 @@ def _create_graph(input_graph=None,
       frozen and used instead of batch statistics during training.
       freeze_bn_delay should be greater than quant_delay and should correspond
       to the number of steps when training has almost converged
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -66,7 +69,8 @@ def _create_graph(input_graph=None,
         is_training,
         quant_delay=quant_delay,
         weight_bits=weight_bits,
-        activation_bits=activation_bits)
+        activation_bits=activation_bits,
+        scope=scope)
 
 
 def create_training_graph(input_graph=None, quant_delay=0):
@@ -133,7 +137,8 @@ def experimental_create_training_graph(input_graph=None,
                                        weight_bits=8,
                                        activation_bits=8,
                                        quant_delay=0,
-                                       freeze_bn_delay=None):
+                                       freeze_bn_delay=None,
+                                       scope=None):
   """Rewrites a training input_graph in place for simulated quantization.
 
   Variables added by the rewrite get added to the global variables collection.
@@ -165,6 +170,8 @@ def experimental_create_training_graph(input_graph=None,
       frozen and used instead of batch statistics during training.
       freeze_bn_delay should be greater than quant_delay and should correspond
       to when training has almost converged
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -177,12 +184,14 @@ def experimental_create_training_graph(input_graph=None,
       weight_bits=weight_bits,
       activation_bits=activation_bits,
       quant_delay=quant_delay,
-      freeze_bn_delay=freeze_bn_delay)
+      freeze_bn_delay=freeze_bn_delay,
+      scope=scope)
 
 
 def experimental_create_eval_graph(input_graph=None,
                                    weight_bits=8,
-                                   activation_bits=8):
+                                   activation_bits=8,
+                                   scope=None):
   """Rewrites an eval input_graph in place for simulated quantization.
 
   Variables added by the rewrite get added to the global variables collection.
@@ -200,8 +209,8 @@ def experimental_create_eval_graph(input_graph=None,
       default graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
-
-
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -211,4 +220,5 @@ def experimental_create_eval_graph(input_graph=None,
       input_graph=input_graph,
       is_training=False,
       weight_bits=weight_bits,
-      activation_bits=activation_bits)
+      activation_bits=activation_bits,
+      scope=scope)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index b9d03c1bc059fe7bcce75978f503cbbf76090dbd..caf8ff28d50d2880d491d04c1ed368597519dcd7 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -66,6 +66,20 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     for fn in rewrite_fns:
       test_fn(fn)
 
+  def _RunTestOverExperimentalRewritesWithScope(self, test_fn, scope):
+    def with_absent_scope(fn):
+      def fn_with_absent_scope(*args):
+        fn(*args, scope=scope)
+      return fn_with_absent_scope
+    rewrite_fns = [
+        with_absent_scope(
+            quantize_graph.experimental_create_training_graph),
+        with_absent_scope(
+            quantize_graph.experimental_create_eval_graph),
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
   def testRewrite(self):
     self._RunTestOverAllRewrites(self._TestRewrite)
 
@@ -99,6 +113,34 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       # Ensure that variables were added.
       self.assertTrue(len(orig_variable_names) < len(q_variables))
 
+  def testWithPreActivationBypass(self):
+    self._RunTestOverAllRewrites(self._TestWithPreActivationBypass)
+
+  def _TestWithPreActivationBypass(self, rewrite_fn):
+    # Tests that the default graph is correctly used when no args are provided
+    # to rewrite_fn.
+    with ops.Graph().as_default() as g:
+      self._ConvLayer(pre_activation_bypass=True, scope='scope1')
+      rewrite_fn()
+
+      op_names = [op.name for op in g.get_operations()]
+      self.assertTrue(
+          any('scope1/add_quant/' in name for name in op_names))
+
+  def testWithPostActivationBypass(self):
+    self._RunTestOverAllRewrites(self._TestWithPostActivationBypass)
+
+  def _TestWithPostActivationBypass(self, rewrite_fn):
+    # Tests that the default graph is correctly used when no args are provided
+    # to rewrite_fn.
+    with ops.Graph().as_default() as g:
+      self._ConvLayer(post_activation_bypass=True, scope='scope1')
+      rewrite_fn()
+
+      op_names = [op.name for op in g.get_operations()]
+      self.assertTrue(any(
+          'scope1/post_activation_bypass_quant/' in name for name in op_names))
+
   def testQuantDelay(self):
     self._RunTestOverTrainingRewrites(self._TestQuantDelay)
 
@@ -224,20 +266,66 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       graph_def_after = str(g.as_graph_def())
       self.assertEqual(graph_def_before, graph_def_after)
 
-  def _ConvLayer(self):
+  def testRewriteWithScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestRewriteWithScope, 'scope1')
+
+  def _TestRewriteWithScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      scope1_output = self._ConvLayer(scope='scope1')
+      self._ConvLayer(input_tensor=scope1_output, scope='scope2')
+
+    rewrite_fn(graph)
+
+    op_names = [op.name for op in graph.get_operations()]
+    # The weights and activation of scope1 is quantized, but not scope2.
+    self.assertTrue(
+        any('scope1/Conv/act_quant' in name for name in op_names))
+    self.assertTrue(
+        any('scope1/Conv/weights_quant' in name for name in op_names))
+    self.assertFalse(
+        any('scope2/Conv/act_quant' in name for name in op_names))
+    self.assertFalse(
+        any('scope2/Conv/weights_quant' in name for name in op_names))
+
+  def testRewriteWithNonMatchingScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestRewriteWithNonMatchingScope, 'NonExistingScope')
+
+  def _TestRewriteWithNonMatchingScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      self._ConvLayer()
+
+    op_names_before_rewrite = set([op.name for op in graph.get_operations()])
+    rewrite_fn(graph)
+    op_names_after_rewrite = set([op.name for op in graph.get_operations()])
+
+    # No ops should be inserted or removed.
+    self.assertEqual(op_names_before_rewrite, op_names_after_rewrite)
+
+  def _ConvLayer(
+      self, input_tensor=None, scope='test', pre_activation_bypass=False,
+      post_activation_bypass=False):
     """Add a basic convolution layer to the default graph."""
     batch_size, height, width, depth = 5, 128, 128, 3
-    inputs = array_ops.zeros((batch_size, height, width, depth))
+    if input_tensor is None:
+      input_tensor = array_ops.zeros((batch_size, height, width, depth))
     weight_init = init_ops.truncated_normal_initializer
-    conv = layers.conv2d(
-        inputs,
-        32, [5, 5],
-        stride=2,
-        padding='SAME',
-        weights_initializer=weight_init(0.09),
-        activation_fn=None,
-        scope='test')
-    _ = nn_ops.relu6(conv)
+    with ops.name_scope(scope):
+      output = layers.conv2d(
+          input_tensor,
+          depth, [5, 5],
+          padding='SAME',
+          weights_initializer=weight_init(0.09),
+          activation_fn=None)
+      if pre_activation_bypass:
+        output += input_tensor
+      output = nn_ops.relu6(output)
+      if post_activation_bypass:
+        output += input_tensor
+    return output
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 8d057d3710579ef3be93ad58a602892a7aa07edf..d37c83d6839f02c52a72cac97c9238c135dc2f66 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -254,12 +254,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     graph = ops.Graph()
     with graph.as_default():
       with graph.name_scope(None):
-        batch_size, height, width, depth = 5, 128, 128, 3
+        batch_size, height, width, depth = 5, 128, 128, 32
         input1 = array_ops.zeros((batch_size, height, width, depth))
         _ = conv2d(
             input1,
             32, [5, 5],
-            stride=2,
             padding='SAME',
             weights_initializer=self._WeightInit(0.09),
             activation_fn=None,
@@ -268,6 +267,33 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
         # Passes if Quantize() does not crash.
 
+  def testWithNonMatchingNameScope(self):
+    self._RunTestOverParameters(self._testWithNonMatchingNameScope)
+
+  def _testWithNonMatchingNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope('name_scope'):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+    op_names_before_quantize = set([op.name for op in graph.get_operations()])
+    quantize.Quantize(
+        graph, is_training, weight_bits=8, activation_bits=8,
+        scope='NonExisting/')
+    op_names_after_quantize = set([op.name for op in graph.get_operations()])
+
+    # No ops should be inserted or removed.
+    self.assertEqual(op_names_before_quantize, op_names_after_quantize)
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
diff --git a/tensorflow/contrib/recurrent/BUILD b/tensorflow/contrib/recurrent/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b3cb04ce26d96333f516f1298c8d5c331964f05b
--- /dev/null
+++ b/tensorflow/contrib/recurrent/BUILD
@@ -0,0 +1,106 @@
+# Recurrent library.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+
+py_library(
+    name = "recurrent_py",
+    srcs = ["python/recurrent_api.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":functional_rnn_ops_py",
+        ":recurrent_ops_py",
+    ],
+)
+
+py_library(
+    name = "recurrent_ops_py",
+    srcs = ["python/ops/recurrent.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_library(
+    name = "functional_rnn_ops_py",
+    srcs = ["python/ops/functional_rnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":recurrent_ops_py",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:standard_ops",
+    ],
+)
+
+cuda_py_tests(
+    name = "recurrent_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/recurrent_test.py"],
+    additional_deps = [
+        ":recurrent_ops_py",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:variables",
+    ],
+    tags = ["nopip"],
+)
+
+cuda_py_tests(
+    name = "functional_rnn_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/functional_rnn_test.py"],
+    additional_deps = [
+        ":functional_rnn_ops_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/tpu:tpu",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    tags = ["nopip"],
+)
diff --git a/tensorflow/contrib/recurrent/README.md b/tensorflow/contrib/recurrent/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..86e10eee517f69d1316b76af85cb13a2bfea2984
--- /dev/null
+++ b/tensorflow/contrib/recurrent/README.md
@@ -0,0 +1,13 @@
+# Recurrent computation library
+
+The recurrent computation library contains code to perform recurrent
+computations.
+
+Its chief application is to implement recurrent neural networks (RNNs, LSTMs,
+etc), which is implemented in `functional_rnn.py`. Similar techniques may be
+used to implement deep networks.
+
+The computation saves the activations in the forward pass, and computes the
+gradients in the backward pass using a single accumulator.
+
+The `functional_rnn` interface is compatible with the `dynamic_rnn` API.
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f19ac7dbe0cee2eb6c780ec5ea6266bc847abd7
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
@@ -0,0 +1,163 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Functional RNN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+from tensorflow.contrib.recurrent.python.ops import functional_rnn
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import rnn as rnn_lib
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variables
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test as test_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+def _CreateStackedLstmCell(*cell_sizes):
+  subcells = [rnn_cell_impl.LSTMCell(cell_size) for cell_size in cell_sizes]
+  return rnn_cell_impl.MultiRNNCell(subcells)
+
+
+class FunctionalRnnTest(test_util.TensorFlowTestCase):
+
+  _BATCH_SIZE = 3
+  _TOTAL_TIME = 5
+  _INPUT_SIZE = 11
+  _NUM_UNITS = 7
+
+  # Set this to some output if you want to use it.
+  _LSTM_GRAPH_DEF_FILEPATH = None
+
+  _CELLDEFS = {
+      'gru': (rnn_cell_impl.GRUCell, [_NUM_UNITS]),
+      'lstm': (rnn_cell_impl.LSTMCell, [_NUM_UNITS]),
+      'stacked_lstm': (_CreateStackedLstmCell, [_NUM_UNITS] * 3)
+  }
+
+  def _CreateCell(self, celldef_name):
+    func, args = self._CELLDEFS[celldef_name]
+    return func(*args)
+
+  def _CreateInputs(self):
+    inputs = np.random.random([FunctionalRnnTest._BATCH_SIZE,
+                               FunctionalRnnTest._TOTAL_TIME,
+                               FunctionalRnnTest._INPUT_SIZE])
+    # Always leave one time slot empty, to check max_length behavior.
+    sequence_length = np.random.randint(
+        0, high=FunctionalRnnTest._TOTAL_TIME - 1,
+        size=FunctionalRnnTest._BATCH_SIZE,
+        dtype=np.int)
+    return (inputs, sequence_length)
+
+  def _CreateRnnGraph(self, create_rnn_computation_func, cell, tf_inputs,
+                      tf_sequence_length, initial_state=None,
+                      time_major=None, scope=None):
+    tf_result = create_rnn_computation_func(cell=cell, inputs=tf_inputs,
+                                            sequence_length=tf_sequence_length,
+                                            initial_state=initial_state,
+                                            dtype=dtypes.float32,
+                                            time_major=time_major,
+                                            scope=scope)
+    grad = gradients_impl.gradients(tf_result, variables.trainable_variables())
+    return {'inference': tf_result, 'grad': grad}
+
+  def _MaybeResetVariables(self, variable_cache, sess, var_list):
+    """Possibly resets the variables to a previously seen value."""
+    reset_ops = []
+    fetches = []
+    for var in var_list:
+      if var.name in variable_cache:
+        reset_ops += [var.assign(variable_cache[var.name])]
+      else:
+        fetches += [(var.name, var)]
+    if reset_ops:
+      sess.run(reset_ops)
+    if fetches:
+      val = sess.run(dict(fetches))
+      for n, v in val.items():
+        assert n not in variable_cache
+        variable_cache[n] = v
+
+  def _RunRnn(self, numpy_inputs, numpy_slen, cell_name, variable_cache,
+              is_dynamic):
+    with ops.Graph().as_default() as graph:
+      tf_inputs = array_ops.placeholder(
+          dtypes.float32, shape=numpy_inputs.shape)
+      tf_slen = array_ops.placeholder(dtypes.int32)
+      feeds = {tf_inputs: numpy_inputs, tf_slen: numpy_slen}
+      cell = self._CreateCell(cell_name)
+      fn = rnn_lib.dynamic_rnn if is_dynamic else functional_rnn.functional_rnn
+      fetches = self._CreateRnnGraph(fn, cell, tf_inputs, tf_slen)
+      with self.test_session(graph=graph) as sess:
+        sess.run(variables.global_variables_initializer())
+        # Note that cell.trainable_variables it not always set.
+        self._MaybeResetVariables(variable_cache, sess,
+                                  variables.trainable_variables())
+        val = sess.run(fetches, feed_dict=feeds)
+      graph_def = graph.as_graph_def()
+      return graph_def, val
+
+  def testRunLstm(self):
+    """Runs a simple LSTM. Does not check output."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    graphdef, _ = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, False)
+    logging.info('graphdef: %s', graphdef)
+    if self._LSTM_GRAPH_DEF_FILEPATH:
+      with open(self._LSTM_GRAPH_DEF_FILEPATH, 'w') as f:
+        f.write(str(graphdef))
+
+  def testLstm(self):
+    """Checks an LSTM against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    _, func_rnn = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, False)
+    _, dyn_rnn = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testGru(self):
+    """Checks a GRU cell against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    _, func_rnn = self._RunRnn(np_inputs, np_slen, 'gru', var_cache, False)
+    _, dyn_rnn = self._RunRnn(np_inputs, np_slen, 'gru', var_cache, True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testStackedLstm(self):
+    """Checks a stacked LSTM cell against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    args = [np_inputs, np_slen, 'stacked_lstm', var_cache]
+    _, func_rnn = self._RunRnn(*(args + [False]))
+    _, dyn_rnn = self._RunRnn(*(args + [True]))
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+
+if __name__ == '__main__':
+  test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..00fbd4fbb8205ceb649616050314e400be1785a5
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
@@ -0,0 +1,192 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Recurrent ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.recurrent.python.ops import recurrent
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test as test_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+_ElmanState = collections.namedtuple('ElmanState', ('h'))
+_ElmanTheta = collections.namedtuple('ElmanTheta', ('w', 'b'))
+_ElmanInputs = collections.namedtuple('ElmanInputs', ('x'))
+
+
+# TODO(drpng): add test for max length computation.
+class RecurrentTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    # pylint:disable=invalid-name
+    _PolyState = collections.namedtuple('PolyState', ('value', 'x_power'))
+    _PolyTheta = collections.namedtuple('PolyTheta', ('x'))
+    _PolyInputs = collections.namedtuple('PolyInputs', ('coeff'))
+    # pylint:enable=invalid-name
+
+    def Poly(theta, state, inputs):
+      next_state = _PolyState(
+          value=state.value + inputs.coeff * state.x_power,
+          x_power=state.x_power * theta.x)
+      return next_state, []
+
+    with self.test_session() as sess:
+      theta = _PolyTheta(x=array_ops.constant(2.0))
+      state = _PolyState(
+          value=array_ops.constant(0.0),
+          x_power=array_ops.constant(1.0))
+      inputs = _PolyInputs(coeff=array_ops.constant([1., 2., 3.]))
+
+      # x = 2
+      # 1 + 2*x + 3*x^2
+      ret = recurrent.Recurrent(theta, state, inputs, Poly)
+
+      acc, state = sess.run(ret)
+      self.assertAllClose(acc.value, [1., 5., 17.])
+      self.assertAllClose(acc.x_power, [2., 4., 8.])
+      self.assertAllClose(state.value, 17.)
+      self.assertAllClose(state.x_power, 8.)
+
+      y = ret[1].value
+      dx, d_coeff = gradients_impl.gradients(ys=[y], xs=[theta.x, inputs.coeff])
+      dx_val, d_coeff_val = sess.run([dx, d_coeff])
+
+      # 2 + 6*x
+      self.assertAllClose(dx_val, 14.)
+      self.assertAllClose(d_coeff_val, [1., 2., 4.])
+
+      # acc = [1, 1+2x, 1+2x+3x^2]
+      # sum(acc) = 3 + 4x + 3x^2
+      acc = ret[0].value
+      dx, d_coeff = gradients_impl.gradients(
+          ys=[math_ops.reduce_sum(acc)], xs=[theta.x, inputs.coeff])
+      dx_val, d_coeff_val = sess.run([dx, d_coeff])
+      # 4 + 6*x
+      self.assertAllClose(dx_val, 16.)
+      self.assertAllClose(d_coeff_val, [3., 4., 4.])
+
+  @staticmethod
+  def Rand(shape):
+    return random_ops.random_uniform(
+        shape, minval=-0.2, maxval=0.2, dtype=dtypes.float64)
+
+  @staticmethod
+  def Elman(theta, state0, inputs):
+    h0, w, b, x = state0.h, theta.w, theta.b, inputs.x
+    xw = math_ops.matmul(array_ops.concat([x, h0], axis=1), w)
+    h1 = math_ops.sigmoid(xw + b)
+    state1 = _ElmanState(h=h1)
+    return (state1, state1)
+
+  @staticmethod
+  def ElmanGrad(theta, state0, inputs, extras, dstate1):
+
+    @function.Defun()
+    def Grad(h0, w, b, x, h1, dh1):
+      del b
+      # We hand-roll the gradient for the 2nd half of the cell as a demo.
+      dxwb = (dh1 * (1 - h1) * h1)
+      dxw, db = dxwb, math_ops.reduce_sum(dxwb, axis=0)
+
+      # Uses tf.gradient for the 1nd half of the cell as a demo.
+      xw = math_ops.matmul(array_ops.concat([x, h0], axis=1), w)
+      dh0, dx, dw = gradients_impl.gradients(
+          ys=[xw], xs=[h0, x, w], grad_ys=[dxw])
+
+      return dh0, dx, dw, db
+
+    dh0, dx, dw, db = Grad(state0.h, theta.w, theta.b, inputs.x,
+                           extras.h, dstate1.h)
+    dstate0 = _ElmanState(h=dh0)
+    dinputs = _ElmanInputs(x=dx)
+    return (_ElmanTheta(w=dw, b=db), dstate0, dinputs)
+
+  @staticmethod
+  def ElmanOut(state1):
+    return _ElmanState(x=state1.h)
+
+  @staticmethod
+  def ElmanOutGrad(dout):
+    return _ElmanState(h=dout.x)
+
+  def testElman(self):
+    for seqlen, use_grad in [(1, False), (1, True), (7, False), (7, True)]:
+      logging.info('== Elman: seqlen=%s, use_grad=%s', seqlen, use_grad)
+      self._ParameterizedTestElman(seqlen, use_grad)
+
+  def _ParameterizedTestElman(self, seqlen, use_grad):
+
+    with self.test_session() as sess:
+      random_seed.set_random_seed(342462)
+
+      batch = 3
+      dims = 4
+      theta = _ElmanTheta(w=RecurrentTest.Rand([2 * dims, dims]),
+                          b=RecurrentTest.Rand([dims]))
+      state0 = _ElmanState(h=RecurrentTest.Rand([batch, dims]))
+      inputs = _ElmanInputs(x=RecurrentTest.Rand([seqlen, batch, dims]))
+
+      # Statically unrolled.
+      s = state0
+      out = []
+      for i in xrange(seqlen):
+        inp = _ElmanInputs(x=inputs.x[i, :])
+        s, _ = RecurrentTest.Elman(theta, s, inp)
+        out += [s.h]
+      acc0, final0 = array_ops.stack(out), s.h
+      loss0 = math_ops.reduce_sum(acc0) + math_ops.reduce_sum(final0)
+      (dw0, db0, dh0, di0) = gradients_impl.gradients(
+          loss0, [theta.w, theta.b, state0.h, inputs.x])
+
+      acc1, final1 = recurrent.Recurrent(
+          theta=theta,
+          state0=state0,
+          inputs=inputs,
+          cell_fn=RecurrentTest.Elman,
+          cell_grad=RecurrentTest.ElmanGrad if use_grad else None)
+      assert isinstance(acc1, _ElmanState)
+      assert isinstance(final1, _ElmanState)
+      acc1, final1 = acc1.h, final1.h
+      loss1 = math_ops.reduce_sum(acc1) + math_ops.reduce_sum(final1)
+      (dw1, db1, dh1, di1) = gradients_impl.gradients(
+          loss1, [theta.w, theta.b, state0.h, inputs.x])
+
+      # Fetches a few values and compare them.
+      (acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0,
+       di1) = sess.run(
+           [acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0, di1])
+      self.assertAllClose(acc0, acc1)
+      self.assertAllClose(final0, final1)
+      self.assertAllClose(dw0, dw1)
+      self.assertAllClose(db0, db1)
+      self.assertAllClose(dh0, dh1)
+      self.assertAllClose(di0, di1)
+
+if __name__ == '__main__':
+  test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a085474c1bf6117ba5663139c78d8f08f71392d3
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
@@ -0,0 +1,396 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tf.nn.dynamic_rnn variant, built on the Recurrent class.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib.recurrent.python.ops import recurrent
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def _GetDTypesFromStructure(struct):
+  dtypes_list = []
+  for x in nest.flatten(struct):
+    x = ops.convert_to_tensor(x)
+    dtypes_list.append(x.dtype)
+  return dtypes_list
+
+
+def _SetShapeFromTemplate(struct, struct_template):
+  as_list = nest.flatten(struct)
+  template_as_list = nest.flatten(struct_template)
+  for element, template in zip(as_list, template_as_list):
+    element.set_shape(template.shape)
+
+
+class _FunctionalRnnCell(object):
+  """Wrapper around RNNCell which separates state from computation.
+
+  This class accomplishes the following:
+  * Turn the cell's `__call__` function into a pure function. The global
+    side effects are separated as `theta`. They are the variables created
+    for the weights of the computation.
+  * Unless the output is aliased as part of the state, extend the state to
+    contain the output so that we store the history in `Recurrent`.
+  * Set static shapes as required.
+  """
+
+  def __init__(self, rnn_cell, seq_inputs, initial_state):
+    assert initial_state is not None
+
+    # TODO(drpng): Dtype needs to be configurable.
+    input_dtypes = [dtypes.float32] + _GetDTypesFromStructure(initial_state)
+    # See _index.
+    like_inputs_t = nest.map_structure(
+        lambda x: array_ops.stop_gradient(array_ops.gather(x, 0)), seq_inputs)
+    input_structure = (like_inputs_t, initial_state)
+
+    @function.Defun(*input_dtypes)
+    def FlatCellStep(*flat_inputs):
+      """The flattened version of `rnn_cell`."""
+      inputs_t, state0 = nest.pack_sequence_as(input_structure, flat_inputs)
+      _SetShapeFromTemplate(state0, initial_state)
+      _SetShapeFromTemplate(inputs_t, like_inputs_t)
+      outputs_t, state1 = rnn_cell(inputs_t, state0)
+      state_list = nest.flatten(state1)
+      self._output_shape = outputs_t.shape
+
+      if outputs_t in state_list:
+        output_index_in_state = state_list.index(outputs_t)
+      else:
+        output_index_in_state = None
+
+      if output_index_in_state is None:
+        self._prepend_output = True
+        self._output_state_idx = 0
+        return [outputs_t] + state_list
+      else:
+        self._output_state_idx = output_index_in_state
+        self._prepend_output = False
+        # To save memory, we don't store return the output separately
+        # from the state list, since we know it's the same.
+        return state_list
+
+    def _ToPureFunction(func):
+      # NOTE: This forces the creating of the function.
+      if func.captured_inputs:
+        pure_func = copy.copy(func)
+        # pylint: disable=protected-access
+        pure_func._extra_inputs = []
+        return pure_func
+      return func
+
+    pure_flat_cell_step = _ToPureFunction(FlatCellStep)
+
+    def CellStep(theta, extended_state0, inputs_t):
+      """Performs one time steps on structured inputs.
+
+      The purpose of this function is to turn the parameters into flattened
+      versions, and to resolve the parameter order difference between
+      `Recurrent` and `RNNCell`.
+
+      In the event the cell returns a transformed output that is not aliased
+      within its state, the `extended_state0` also contains the output as its
+      first element.
+
+      Args:
+        theta: Weights required for the computation. A structure of tensors.
+        extended_state0: the state0, and possibly the output at the previous
+          time step. A structure of tensors.
+        inputs_t: the inputs at time t.
+
+      Returns:
+        A pair of the next state (inclusive of the output), and an empty list
+        (unused `extras`).
+        The next state is congruent to state0.
+      """
+      extended_state0_flat = nest.flatten(extended_state0)
+      state0_flat = self.MaybeRemoveOutputFromState(extended_state0_flat)
+      full_inputs = [inputs_t] + state0_flat + theta
+      # Note that the thetas are additional inputs appeneded as extra
+      # parameters.
+      cell_out = pure_flat_cell_step(*full_inputs)
+      return cell_out, []
+
+    self._cell_step = CellStep
+    self._theta = FlatCellStep.captured_inputs
+    self._zero_state = rnn_cell.zero_state
+    self._state_template = initial_state
+    self._output_size = rnn_cell.output_size
+
+  @property
+  def extended_initial_state(self):
+    if self._prepend_output:
+      return [array_ops.zeros(self._output_shape), self._state_template]
+    else:
+      # The base case, where the output is just the hidden state.
+      return self._state_template
+
+  @property
+  def cell_step(self):
+    return self._cell_step
+
+  @property
+  def theta(self):
+    return self._theta
+
+  @property
+  def state_template(self):
+    return self._state_template
+
+  @property
+  def output_shape(self):
+    return self._output_shape
+
+  def GetOutputFromState(self, state):
+    return nest.flatten(state)[self._output_state_idx]
+
+  def MaybeRemoveOutputFromState(self, flat_state):
+    if self._prepend_output:
+      return flat_state[1:]
+    return flat_state
+
+
+def _ApplyLengthsToBatch(sequence_lengths, tf_output):
+  # TODO(drpng): just use Update so that we don't carry over the gradients?
+  """Sets the output to be zero at the end of the sequence."""
+  # output is batch major.
+  batch_size, max_time, vector_size = tf_output.shape
+  output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
+  output_time = array_ops.reshape(output_time, [batch_size, max_time])
+  lengths = array_ops.tile(
+      array_ops.reshape(sequence_lengths, [-1, 1]), [1, max_time])
+  is_less = math_ops.cast(
+      math_ops.less(output_time, lengths), dtype=dtypes.float32)
+  keep_mask = array_ops.tile(
+      array_ops.expand_dims(is_less, -1),
+      [1, 1, vector_size])
+  final_output = keep_mask * tf_output
+  return final_output
+
+
+def _PickFinalStateFromHistory(acc_state, sequence_length):
+  """Implements acc_state[sequence_length - 1]."""
+  # This will work on all platforms, unlike the regular slice.
+  last_value = []
+  for state_var in nest.flatten(acc_state):
+    # We compute the following with matrix operations:
+    # last_var = state_var[sequence_length - 1]
+    shape = array_ops.shape(state_var)
+    max_time, batch_size = shape[0], shape[1]
+    output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
+    output_time = array_ops.reshape(output_time, [batch_size, max_time])
+    lengths = array_ops.tile(array_ops.reshape(sequence_length,
+                                               [-1, 1]), [1, max_time])
+    last_idx = math_ops.cast(math_ops.equal(output_time, lengths - 1),
+                             dtype=dtypes.float32)
+    last_idx = array_ops.transpose(last_idx)
+    last_idx_for_bcast = array_ops.expand_dims(last_idx, -1)
+    sliced = math_ops.multiply(last_idx_for_bcast, state_var)
+    last_var = math_ops.reduce_sum(sliced, 0)
+    last_value += [last_var]
+  return nest.pack_sequence_as(acc_state, last_value)
+
+
+def _PostProcessOutput(extended_acc_state, extended_final_state, func_cell,
+                       total_time, inputs_lengths):
+  """Post-process output of recurrent.
+
+  This function takes the accumulated extended state and extracts the requested
+  state and output.
+
+  When `inputs_lengths` has been set, it extracts the output from the
+  accumulated state. It also sets outputs past.
+
+  It also sets the static shape information.
+
+  Args:
+    extended_acc_state: A structure containing the accumulated state at each
+      time. It may contain the output at each time as well.
+    extended_final_state: A structure containing the final state. It may
+      contain the output at the final time.
+    func_cell: The functional wrapper around the cell.
+    total_time: A scalar integer tensor.
+    inputs_lengths: An integer tensor with one entry per input.
+
+  Returns:
+    A tuple with the outputs at each time, and the final state.
+  """
+  if inputs_lengths is None:
+    flat_final_state = func_cell.MaybeRemoveOutputFromState(
+        nest.flatten(extended_final_state))
+    tf_state = nest.pack_sequence_as(func_cell.state_template, flat_final_state)
+  else:
+    # The accumulated state is over the entire sequence, so we pick it
+    # out from the acc_state sequence.
+    flat_acc_state = func_cell.MaybeRemoveOutputFromState(
+        nest.flatten(extended_acc_state))
+    acc_state = nest.pack_sequence_as(
+        func_cell.state_template, flat_acc_state)
+    tf_state = _PickFinalStateFromHistory(acc_state, inputs_lengths)
+
+  output_from_state = func_cell.GetOutputFromState(extended_acc_state)
+  tf_output = array_ops.transpose(output_from_state, [1, 0, 2])
+  tf_output.set_shape(
+      [func_cell.output_shape[0], total_time, func_cell.output_shape[1]])
+  if inputs_lengths is not None:
+    # Need set the outputs to zero.
+    tf_output = _ApplyLengthsToBatch(inputs_lengths, tf_output)
+    # tf_output = array_ops.zeros([4, 3, 5])
+  _SetShapeFromTemplate(tf_state, func_cell.state_template)
+  return tf_output, tf_state
+
+
+# pylint: disable=invalid-name
+def functional_rnn(cell, inputs, sequence_length=None,
+                   initial_state=None, dtype=None, time_major=False,
+                   scope=None, use_tpu=False):
+  """Same interface as `tf.nn.dynamic_rnn`."""
+  with variable_scope.variable_scope(scope or 'rnn'):
+    if not time_major:
+      inputs = nest.map_structure(
+          lambda t: array_ops.transpose(t, [1, 0, 2]), inputs)
+    inputs_flat = nest.flatten(inputs)
+    batch_size = array_ops.shape(inputs_flat[0])[1]
+    if initial_state is None:
+      initial_state = cell.zero_state(batch_size, dtype)
+    func_cell = _FunctionalRnnCell(cell, inputs, initial_state)
+  extended_acc_state, extended_final_state = recurrent.Recurrent(
+      theta=func_cell.theta,
+      state0=func_cell.extended_initial_state,
+      inputs=inputs,
+      cell_fn=func_cell.cell_step,
+      use_tpu=use_tpu)
+  return _PostProcessOutput(extended_acc_state, extended_final_state,
+                            func_cell, inputs_flat[0].shape[0], sequence_length)
+
+
+def bidirectional_functional_rnn(
+    cell_fw,
+    cell_bw,
+    inputs,
+    initial_state_fw=None,
+    initial_state_bw=None,
+    dtype=None,
+    sequence_length=None,
+    time_major=False,
+    use_tpu=False,
+    scope=None):
+  """Creates a bidirectional recurrent neural network.
+
+  Performs fully dynamic unrolling of inputs in both directions. Built to be API
+  compatible with `tf.nn.bidirectional_dynamic_rnn`, but implemented with
+  functional control flow for TPU compatibility.
+
+  Args:
+    cell_fw: An instance of `tf.contrib.rnn.RNNCell`.
+    cell_bw: An instance of `tf.contrib.rnn.RNNCell`.
+    inputs: The RNN inputs. If time_major == False (default), this must be a
+      Tensor (or hierarchical structure of Tensors) of shape
+      [batch_size, max_time, ...]. If time_major == True, this must be a Tensor
+      (or hierarchical structure of Tensors) of shape:
+      [max_time, batch_size, ...]. The first two dimensions must match across
+      all the inputs, but otherwise the ranks and other shape components may
+      differ.
+    initial_state_fw: An optional initial state for `cell_fw`. Should match
+      `cell_fw.zero_state` in structure and type.
+    initial_state_bw: An optional initial state for `cell_bw`. Should match
+      `cell_bw.zero_state` in structure and type.
+    dtype: (optional) The data type for the initial state and expected output.
+      Required if initial_states are not provided or RNN state has a
+      heterogeneous dtype.
+    sequence_length: An optional int32/int64 vector sized [batch_size]. Used to
+      copy-through state and zero-out outputs when past a batch element's
+      sequence length. So it's more for correctness than performance.
+    time_major: Whether the `inputs` tensor is in "time major" format.
+    use_tpu: Whether to enable TPU-compatible operation. If True, does not truly
+      reverse `inputs` in the backwards RNN. Once b/69305369 is fixed, we can
+      remove this flag.
+    scope: An optional scope name for the dynamic RNN.
+
+  Returns:
+    outputs: A tuple of `(output_fw, output_bw)`. The output of the forward and
+      backward RNN. If time_major == False (default), these will
+      be Tensors shaped: [batch_size, max_time, cell.output_size]. If
+      time_major == True, these will be Tensors shaped:
+      [max_time, batch_size, cell.output_size]. Note, if cell.output_size is a
+      (possibly nested) tuple of integers or TensorShape objects, then the
+      output for that direction will be a tuple having the same structure as
+      cell.output_size, containing Tensors having shapes corresponding to the
+      shape data in cell.output_size.
+    final_states: A tuple of `(final_state_fw, final_state_bw)`. A Tensor or
+      hierarchical structure of Tensors indicating the final cell state in each
+      direction. Must have the same structure and shape as cell.zero_state.
+
+  Raises:
+    ValueError: If `initial_state_fw` is None or `initial_state_bw` is None and
+      `dtype` is not provided.
+  """
+  # Keep this code in sync with tf.nn.dynamic_rnn for compatibility.
+  with variable_scope.variable_scope(scope or 'bidirectional_rnn'):
+    # Forward direction
+    with variable_scope.variable_scope('fw') as fw_scope:
+      output_fw, output_state_fw = functional_rnn(
+          cell=cell_fw, inputs=inputs, sequence_length=sequence_length,
+          initial_state=initial_state_fw, dtype=dtype,
+          time_major=time_major, scope=fw_scope, use_tpu=use_tpu)
+    # Backward direction
+    if not time_major:
+      time_dim = 1
+      batch_dim = 0
+    else:
+      time_dim = 0
+      batch_dim = 1
+
+    def _reverse(input_, seq_lengths, seq_dim, batch_dim):
+      if seq_lengths is not None:
+        return array_ops.reverse_sequence(
+            input=input_, seq_lengths=seq_lengths,
+            seq_dim=seq_dim, batch_dim=batch_dim)
+      else:
+        # See b/69305369.
+        assert not use_tpu, (
+            'Bidirectional with variable sequence lengths unsupported on TPU')
+        return array_ops.reverse(input_, axis=[seq_dim])
+
+    with variable_scope.variable_scope('bw') as bw_scope:
+      inputs_reverse = _reverse(
+          inputs, seq_lengths=sequence_length,
+          seq_dim=time_dim, batch_dim=batch_dim)
+      tmp, output_state_bw = functional_rnn(
+          cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length,
+          initial_state=initial_state_bw, dtype=dtype,
+          time_major=time_major, scope=bw_scope, use_tpu=use_tpu)
+
+  output_bw = _reverse(
+      tmp, seq_lengths=sequence_length,
+      seq_dim=time_dim, batch_dim=batch_dim)
+
+  outputs = (output_fw, output_bw)
+  output_states = (output_state_fw, output_state_bw)
+
+  return (outputs, output_states)
+# pylint: enable=invalid-name
diff --git a/tensorflow/contrib/recurrent/python/ops/recurrent.py b/tensorflow/contrib/recurrent/python/ops/recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa16b82ab62f27d034c3ca7584e7e1ca14be6f9b
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/ops/recurrent.py
@@ -0,0 +1,720 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent computation.
+
+The main interface of this module is Recurrent().
+A recurrent computation describes an auto-regressive process, where outputs
+of one time step are fed to the output of the next time step.
+
+This module uses:
+  theta: the "weights" each RNN uses.
+  state0: the initial state of each RNN.
+  cell_fn: A python function describing RNN cell. It must has the following
+    signature:
+         cell_fn: (theta, state0, inputs) -> (state1, extras)
+    state1 is the next RNN state, extras are computed by cell_fn
+    and the library forwards extras to cell_fn's gradient function.
+  cell_grad: A python function describing the backprop gradient function
+    for the RNN cell. It must has the following signature:
+         cell_grad: (theta, state0, inputs, extras, dstate1) -> (
+                  dtheta, dstate0, dinputs)
+    dstate1 is what the backprop algorithm provides representing
+    gradients of state1 w.r.t. the final loss.
+
+In this module, we handle structures of tensors for theta, state0, inputs,
+and extras. The structure is an arbitrarily nested python structure, such
+as a dictionary of named tuples.
+
+Because the computation is a left-to-right chain, a single in-place accumulator
+can be used rather than a stack. Thus a special gradient was written to reduce
+unnecessary memory usage.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.inplace_ops import alias_inplace_update
+from tensorflow.python.util import nest
+
+
+def _AssertIsCompatible(a, b):
+  """Checks that `a` and `b` are nested structures of the same type."""
+  # TODO(drpng): implement.
+  del a
+  del b
+
+
+def _Index(struct, index):
+  """Returns a structure with `x[index]` for each tensor `x` in the structure.
+
+  Args:
+    struct: A structure of tensors.
+    index: A scalar integer tensor. Performance is better if `index` is
+      on the host memory.
+
+  Returns:
+    A structure of tensors congruent to `struct`.
+    For each key in `ret`, `rets[key] = struct[key][index]`.
+  """
+  index = ops.convert_to_tensor(index)
+  index.get_shape().assert_has_rank(0)
+  return nest.map_structure(lambda x: x[index], struct)
+
+
+def _Update(struct_acc, struct_x, t):
+  """Updates t-th row in accumulators.
+
+  Args:
+    struct_acc: The accumulators. A structure of tensors.
+    struct_x: The new values. A structure of tensors congruent to `struct_acc`.
+    t: A scalar integer. Performance is better if `t` is on the device
+      memory.
+
+  Returns:
+    A structure of tensors. Say, ret is a returned dictionary. Then, for
+    each key, we have:
+      ret[key] = struct_acc[key];
+      ret[key][t, :] = struct_x[key]
+  """
+  to_skip_update = set()
+  acc_lst = nest.flatten(struct_acc)
+  x_lst = nest.flatten(struct_x)
+  t = math_ops.to_int32([t])  # tf.to_int32 casts on-device tensors.
+  lst = []
+  for acc, x in zip(acc_lst, x_lst):
+    if acc in to_skip_update:
+      # Until b/62105730 is fixed, we need to avoid inplace update for tensors
+      # of rank 1.  could reshape to handle it, but we don't really need the
+      # values applied to these, so just skip their modification.
+      lst += [acc]
+    else:
+      lst += [alias_inplace_update(acc, t, array_ops.expand_dims(x, 0))]
+  return nest.pack_sequence_as(struct_acc, lst)
+
+
+def _SeqLenDim(struct):
+  """Returns the 0-th dim size of tensors in a structure of tensors.
+
+  This is the max sequence length according to the shape of the inputs.
+
+  Args:
+    struct: A structure of tensors. Every tensor's 0-th dim has the same size.
+
+  Returns:
+    A scalar tensor which is the size of 0-th dim of every tensors in struct.
+  """
+  xs = nest.flatten(struct)
+  assert xs
+  dim0 = array_ops.shape(xs[0])[0]
+  return dim0
+
+
+def _Flatten(struct):
+  """Flattens a structure."""
+  return nest.flatten(struct)
+
+
+def _Pack(elements, struct_template):
+  """Packs the list of tensors according to the structure.
+
+  In the event that `elements` should be a scalar, `struct_template` must
+  contain exactly one non-trivial element (for instance, `[[], {'x':elt}]`).
+
+  Args:
+    elements: Elements to be packed. A list of tensor, or a single tensor.
+    struct_template: The container structure in which to pack them.
+  Returns:
+    A python structure of the same type as `struct_template`, containing
+    `elements` as its contained elements.
+  """
+  if not nest.is_sequence(elements):
+    return nest.pack_sequence_as(struct_template, [elements])
+  return nest.pack_sequence_as(struct_template, elements)
+
+
+def _EmptyAcc(slen, struct_template):
+  """Creates a set of accumulators for tensors in structure.
+
+  Args:
+    slen: The sequence length. A scalar tensor.
+    struct_template: A structure of tensors.
+
+  Returns:
+    A structure congruent to `struct_template`. Say ret is a returned
+    dictionary. Then, `ret.key`, a tensor, has the same dtype as
+    `struct_template.key`. The tensor's shape has 1 more dimension
+    than the tensor `struct_template.key`. The extra 0-th dimension is of size
+    `slen`. E.g., if `slen=10` and `struct_template.key`'s shape is `[3, 5]`,
+    then, `ret.key`'s shape is `[10, 3, 5]`.
+  """
+
+  def _EmptyAccForTensor(tensor):
+    return inplace_ops.empty(
+        array_ops.concat([[slen], array_ops.shape(tensor)], axis=0),
+        tensor.dtype,
+        init=True)
+
+  return nest.map_structure(_EmptyAccForTensor, struct_template)
+
+
+def _EmptyLike(struct):
+  """Creates a set of empty initialized tensors.
+
+  Args:
+    struct: A structure of tensors.
+
+  Returns:
+    A struct of tensors. Each tensor has the same shape and dtype as
+    its corresponding tensor in `struct`. And each tensor is initialized.
+  """
+  return nest.map_structure(
+      lambda x: inplace_ops.empty_like(x, init=True), struct)
+
+
+def _Add(struct_x, struct_y):
+  """Adds tensors in `struct_x` with respective tensors in `struct_y`.
+
+  Args:
+    struct_x: A struct of tensors.
+    struct_y: A struct of tensors congruent to `struct_x`.
+
+  Returns:
+    A struct of tensors. Each element of the returned value
+  equals `x + y`, with corresponding values in `struct_x` and `struct_y`.
+  """
+  list_x = nest.flatten(struct_x)
+  list_y = nest.flatten(struct_y)
+  z = []
+  for x, y in zip(list_x, list_y):
+    z += [math_ops.add(x, y)]
+  return nest.pack_sequence_as(struct_x, z)
+
+
+def _Dtypes(struct):
+  """Returns all tensors' data types in a list."""
+  return [x.dtype for x in nest.flatten(struct)]
+
+
+def _ConvertNoneGradientToZeros(xs, dxs):
+  """Sanitize dxs so that None becomes zeros appropriately.
+
+  Args:
+    xs: A list of tensors.
+    dxs: A list of tensors. dxs[i] corresponds to xs[i]'s gradient.
+
+  Returns:
+    A structure same as `dxs` with `None` replaced by a zero tensor.
+  """
+  list_xs = nest.flatten(xs)
+  list_dxs = nest.flatten(dxs)
+
+  # If x does not get any backprop-ed gradient, propagate zeros.
+  rets = []
+  for (x, dx) in zip(list_xs, list_dxs):
+    if dx is None:
+      rets.append(array_ops.zeros_like(x))
+    else:
+      rets.append(dx)
+
+  return nest.pack_sequence_as(dxs, rets)
+
+
+# All structures are flattened for use internally. This is for simplicity
+# and also to use the Defun construct.
+# In the forward pass (inference), the computation is structured as follows.
+# Forward: [gradient = _Recurrent.Grad]
+#   Flatten structures, create accumulators.
+#   for t = 0..max_input_length:
+#     Defun ForwardLoopBody:
+#       Defun Fwd: flatten/pack around cell_fn
+#       state1 = Fwd(inputs[t], state0)
+#       acc_state += [state1]
+#   Pack structures.
+# During the backward pass (backpropping the gradient from the last time
+# step to the first, through the structure), the computation is structured
+# as follows.
+# Grad:
+#   Flatten structures.
+#   Defun Backward:
+#     Create create accumulated derivatives: d_theta, d_inputs, d_acc_state.
+#     Regarding the note at the top of the file, there is only one accumulator
+#     for d_theta accumulated over the whole sequence.
+#     for t = max_input_length -1..0:
+#       Defun BackwardLoopBody:
+#         Retrieve acc_state[t] computed in the forward pass.
+#         Defun Bak: flatten/back around cell_fn_grad.
+#         d_state1 is d_state0 from previous step (ie next time).
+#         d_acc_state[dev_t] += d_state1
+#         d_theta_t, d_state0, d_inputs_t, = Bak()
+#         d_inputs[dev_t] += d_inputs
+#         d_theta += d_theta_t
+#         d_acc_state[t] += d_state1
+#   Pack structures and return.
+class _Recurrent(object):
+  """A helper class to construct a recurrent neural net."""
+
+  def __init__(self, cell_fn, cell_grad, theta, state0, inputs,
+               max_input_length, extras, use_tpu):
+    """RNN helper class.
+
+    Args:
+      cell_fn: A python function, which computes:
+         state1, extras = cell_fn(theta, state0, inputs[t, :])
+      cell_grad: A python function which computes:
+         dtheta, dstate0, dinputs[t, :] = cell_grad(
+           theta, state0, inputs[t, :], extras, dstate1)
+      theta: weights. A structure of tensors.
+      state0: initial state. A structure of tensors.
+      inputs: inputs. A structure of tensors.
+      max_input_length: None, or the maximum effective length of the input over
+        all batches. A scalar tensor.
+      extras: A structure of tensors. The 2nd return value of every
+        invocation of cell_fn is a structure of tensors with matching keys
+        and shapes of this `extras`.
+      use_tpu: A boolean indicating whether the computation is mean to
+        run on a TPU.
+    """
+    self._theta = theta
+    self._state = state0
+    self._inputs = inputs
+    self._max_input_length = self._MaybeComputeMaxInputLength(
+        inputs, max_input_length)
+    self._cell_fn = cell_fn
+    self._cell_grad = cell_grad
+    self._extras = extras
+
+    # pylint: disable=unbalanced-tuple-unpacking
+
+    # NOTE: TF Function (Fwd, Bak, ForwardLoopBody, BackwardLoopBody,
+    # Forward and Backward defined below) simply takes a list of
+    # Tensors and returns a list of Tensors. When we pass in a
+    # structure (a list of structures of Tensors), we use _Flatten to
+    # convert the structure into a list of tensor. Conversely, the
+    # following code often uses _Pack to formulate a structure from a
+    # list of tensors based on a "template".
+
+    # Wraps cell_fn in a TF Function:
+    #    state1 = cell_fn(theta, state0, inputs)
+    fwd_sig = [self._theta, self._state, self._inputs]
+
+    compiled = use_tpu
+    noinline = not compiled
+    dev_t_type = dtypes.int32 if use_tpu else dtypes.int64
+
+    @function.Defun(*_Dtypes(fwd_sig))
+    def Fwd(*args):
+      (theta, state0, inputs) = _Pack(args, fwd_sig)
+      state1, extras = self._cell_fn(theta, state0, inputs)
+      assert not function.get_extra_args(), (
+          'cell_fn is not pure with extra args: %s.' %
+          (function.get_extra_args()))
+      _AssertIsCompatible(state1, self._state)
+      _AssertIsCompatible(extras, self._extras)
+      return _Flatten([state1, extras])
+
+    # Wraps cell_fn in a TF Function as a for-loop's body.
+    #
+    # The loop state is composed of:
+    #  t: The loop variable. Timestep id.
+    #  dev_t: The loop variable mirrored on the device.
+    #  theta: the recurrent net's weights.
+    #  state0: the previous recurrent state.
+    #  inputs: inputs to the recurrent net. inputs[t, :] are for the timestep t.
+    #  acc_state: Each timestep's computed new state is also stashed into
+    #    acc_state.
+    #  acc_extras: Each timestep's computed extras is stashed into acc_extras
+    fwdloop_sig = [
+        self._theta, self._state, self._inputs, self._state, self._extras
+    ]
+
+    @function.Defun(dtypes.int32, dev_t_type, *_Dtypes(fwdloop_sig))
+    def ForwardLoopBody(*args):
+      """The body of forward loop."""
+      t, dev_t = args[0], args[1]
+      (theta, state0, inputs, acc_state, acc_extras) = _Pack(
+          args[2:], fwdloop_sig)
+      inputs_t = _Index(inputs, t)  # external input at time step t.
+      fwd = Fwd(*_Flatten([theta, state0, inputs_t]))
+      state1, extras = _Pack(fwd, [self._state, self._extras])
+      # Saves state1 and extras in their accumulators.
+      acc_state = _Update(acc_state, state1, dev_t)
+      acc_extras = _Update(acc_extras, extras, dev_t)
+
+      return [math_ops.add(dev_t, 1)] + _Flatten(
+          [theta, state1, inputs, acc_state, acc_extras])
+
+    def Grad(op, *args):
+      """The python grad function for the Forward function."""
+
+      # NOTE: tf.gradient backprops None for int32/int64 while zeros
+      # for float32/float64. For consistency, we always backprop
+      # zeros.
+      args = list(args)
+      for i, dy in enumerate(args):
+        if dy is None:
+          args[i] = array_ops.zeros_like(op.outputs[i])
+      # TODO(drpng): getting the extra state here?
+      op_inputs = [x for x in op.inputs]
+      op_struct = [
+          self._theta, self._state, self._inputs, self._max_input_length,
+          self._extras
+      ]
+      (theta, state0, inputs, max_input_length, _) = _Pack(op_inputs, op_struct)
+      # acc_state and acc_extras are computed by the Forward pass and
+      # needed by the Backward pass.
+      acc_state, _, acc_extras = _Pack([x for x in op.outputs],
+                                       [self._state, self._state, self._extras])
+
+      # Forward computes acc_state, the final state and
+      # acc_extras. tf.gradients gives us their gradients w.r.t. the
+      # final loss. Because acc_extras are not exposed by Compute(),
+      # it has no gradients w.r.t. the final loss (i.e., by
+      # construction, it must be zeros).
+      d_acc_state, d_state1, _ = _Pack(args,
+                                       [self._state, self._state, self._extras])
+      return Backward(*_Flatten([
+          theta, state0, inputs, max_input_length, acc_state, acc_extras,
+          d_acc_state, d_state1
+      ]))
+
+    # Forward calls ForwardLoopBody n times. Each time computes one
+    # time step of the recurrent net.
+    forward_sig = [
+        self._theta, self._state, self._inputs, self._max_input_length,
+        self._extras
+    ]
+
+    @function.Defun(
+        *_Dtypes(forward_sig), python_grad_func=Grad, noinline=noinline)
+    def Forward(*args):
+      """Forward pass of the recurrent net."""
+      theta, state0, inputs, max_input_length, extras = _Pack(args, forward_sig)
+
+      slen_dim = _SeqLenDim(inputs)
+
+      # Creates accumulators for state0 and extras.
+      acc_state = _EmptyAcc(slen_dim, state0)
+      acc_extras = _EmptyAcc(slen_dim, extras)
+
+      dev_t = array_ops.constant(0, dtype=dev_t_type)
+      run = functional_ops.For(
+          start=0,
+          limit=max_input_length,
+          delta=1,
+          inputs=[dev_t] + _Flatten(
+              [theta, state0, inputs, acc_state, acc_extras]),
+          body=ForwardLoopBody,
+          rewrite_with_while=compiled)
+      _, state1, _, acc_state, acc_extras = _Pack(
+          run[1:],
+          [self._theta, self._state, self._inputs, self._state, self._extras])
+
+      return _Flatten([acc_state, state1, acc_extras])
+
+    # The per-step backward computes:
+    #    d_theta, d_state0, d_inputs = cell_grad(
+    #        theta, state0, inputs, extras, d_state1)
+    # where d_state1 is the backprop-ed gradient for state1, and
+    # extras is the computed by the forward step to facilitate the
+    # backward step.
+    bak_sig = [
+        self._theta, self._state, self._inputs, self._extras, self._state
+    ]
+
+    @function.Defun(*_Dtypes(bak_sig))
+    def Bak(*args):
+      """Backward step."""
+      (theta, state0, inputs, extras, d_state1) = _Pack(args, bak_sig)
+      (dtheta, dstate0, dinputs) = self._cell_grad(theta, state0, inputs,
+                                                   extras, d_state1)
+      assert not function.get_extra_args(), (
+          'cell_grad is not pure with extra args: %s.' %
+          (function.get_extra_args()))
+      _AssertIsCompatible(dtheta, self._theta)
+      _AssertIsCompatible(dstate0, self._state)
+      _AssertIsCompatible(dinputs, self._inputs)
+      return _Flatten(
+          _ConvertNoneGradientToZeros([theta, state0, inputs],
+                                      [dtheta, dstate0, dinputs]))
+
+    # Define defuns used by a functional_ops.If in BackwardLoopBody.
+    state_if_sig = [self._state, self._state]
+
+    @function.Defun(*_Dtypes(state_if_sig))
+    def ReturnOrigState0(*args):
+      """Returns original state0 from inputs."""
+      (_, orig_state0) = _Pack(args, state_if_sig)
+      return nest.flatten(orig_state0)
+
+    @function.Defun(*_Dtypes(state_if_sig))
+    def ReturnAccState(*args):
+      """Returns acc_state[t-1] from inputs."""
+      (acc_state, _) = _Pack(args, state_if_sig)
+      return nest.flatten(acc_state)
+
+    # Wraps cell_grad gradient function in a TF Function as a
+    # for-loop's body for the Backward pass.
+    #
+    # The loop state is composed of:
+    #  t: The loop variable. Timestep id.
+    #  state0: the initial state for the entire backward loop.
+    #  dev_t: The loop variable mirrored on the device.
+    #  theta: the recurrent net's weights.
+    #  inputs: inputs to the recurrent net. inputs[t, :] are for the timestep t.
+    #  acc_state: Each timestep's computed new state was stashed into
+    #    acc_state by the Forward pass.
+    #  acc_extras: Each timestep's computed extras was stashed into
+    #    acc_extras by the Forward pass.
+    #  d_theta: All timestep's gradient for theta is accumulated (added) into
+    #      d_theta.
+    #  d_state1: The backprop-ed gradient for the new stated computed by
+    #      timestep t.
+    #  d_inputs: d_inputs[t, :] is populated by the backward time step t.
+    #  d_acc_state: The backprop-ed gradient for acc_state.
+    bakloop_sig = [
+        self._theta, self._state, self._inputs, self._state, self._extras,
+        self._theta, self._state, self._inputs, self._state
+    ]
+
+    @function.Defun(dtypes.int32, dev_t_type, *_Dtypes(bakloop_sig))
+    def BackwardLoopBody(*args):
+      """Backward loop body function."""
+      t, dev_t = args[0], args[1]
+      (theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state1,
+       d_inputs, d_acc_state) = _Pack(args[2:], bakloop_sig)
+
+      # The input recurrent state for time step t is previous time step's
+      # output, or the original state0 when on time step 0.
+      state_from_acc = _Index(acc_state, math_ops.maximum(0, t - 1))
+      state0 = functional_ops.If(
+          math_ops.equal(t, array_ops.constant(0, dtypes.int32)),
+          _Flatten([state_from_acc, orig_state0]), ReturnOrigState0,
+          ReturnAccState)
+      state0 = nest.pack_sequence_as(orig_state0, state0)
+
+      # The external inputs for time step t.
+      inputs_t = _Index(inputs, t)
+      # The extras for time step t.
+      extras_t = _Index(acc_extras, t)
+
+      d_state1 = _Add(_Index(d_acc_state, t), d_state1)
+      (d_theta_t, d_state0, d_inputs_t) = _Pack(
+          Bak(*_Flatten([theta, state0, inputs_t, extras_t, d_state1])),
+          [self._theta, self._state, self._inputs])
+      d_theta = _Add(d_theta, d_theta_t)
+      d_inputs = _Update(d_inputs, d_inputs_t, dev_t)
+      return [math_ops.subtract(dev_t, 1)] + _Flatten([
+          theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state0,
+          d_inputs, d_acc_state
+      ])
+
+    # Backward calls BackwardLoopBody n times.  Each time computes the backprop
+    # for one time step of the recurrent net.
+    backward_sig = [
+        self._theta, self._state, self._inputs, self._max_input_length,
+        self._state, self._extras, self._state, self._state
+    ]
+
+    @function.Defun(*_Dtypes(backward_sig), noinline=noinline)
+    def Backward(*args):
+      """Backward pass for the recurrent net."""
+      # theta, state0, inputs are Forward's inputs.
+      # acc_state is the accumulated 1st output of Forward.
+      # acc_extras is the accumulated 2nd output of Forward.
+      # d_acc_state is the gradient for acc_state.
+      # d_state1 is the gradient for the final state computed by Forward.
+      (theta, state0, inputs, max_input_length, acc_state, acc_extras,
+       d_acc_state, d_state1) = _Pack(args, backward_sig)
+
+      # Accumulators for gradients.
+      d_theta = _EmptyLike(theta)
+      d_inputs = _EmptyLike(inputs)
+
+      # Loop backwards. Note the loop's limit is open-ended, so goes through
+      # t=0.
+      t = max_input_length - 1
+      dev_t = math_ops.to_int32(t) if use_tpu else math_ops.to_int64(t)
+      run = functional_ops.For(
+          start=t,
+          limit=-1,
+          delta=-1,
+          inputs=[dev_t] + _Flatten([
+              theta, state0, inputs, acc_state, acc_extras, d_theta, d_state1,
+              d_inputs, d_acc_state
+          ]),
+          body=BackwardLoopBody,
+          rewrite_with_while=compiled)
+
+      (theta, state0, inputs, acc_state, acc_extras, d_theta, d_state0,
+       d_inputs, d_acc_state) = _Pack(run[1:], bakloop_sig)
+
+      d_max_input_length = array_ops.constant(0, dtype=max_input_length.dtype)
+      return _Flatten(
+          [d_theta, d_state0, d_inputs, d_max_input_length, acc_extras])
+
+    self._forward = Forward
+
+  def _MaybeComputeMaxInputLength(self, inputs, max_input_length):
+    if max_input_length is not None:
+      return max_input_length
+    return math_ops.reduce_max(array_ops.shape(nest.flatten(inputs)[0])[0])
+
+  def Compute(self):
+    return _Pack(
+        self._forward(*_Flatten([
+            self._theta, self._state, self._inputs, self._max_input_length,
+            self._extras
+        ])), [self._state, self._state, self._extras])[:2]
+
+
+def _GetCellGrad(cell_fn, cell_grad):
+  """Returns the gradient function for cell_fn.
+
+  Args:
+    cell_fn: The recurrent neural net's cell function.
+    cell_grad: If not None, cell_fn's gradient function.
+
+  Returns:
+    Returns cell_grad if not None. Otherwise, assume cell_fn is a python
+    function representing the recurrent neural net's cell function, i.e.,
+      cell_fn: (theta, state0, inputs) -> (state1, extra)
+    returns its default gradient python function, i.e.,
+      cell_grad: (theta, state0, inputs, extras, dstate1) -> (
+                  dtheta, dstate0, dinputs)
+  """
+
+  if cell_grad:
+    return cell_grad
+
+  def CellGrad(theta, state0, inputs, extras, dstate1):
+    """Default gradient function for cell_fn."""
+    # NOTE: The default grad function recomputes the forward
+    # function and does not take advantage of 'extras' returned by
+    # the forward function.
+    del extras
+    state1, extras = cell_fn(theta, state0, inputs)
+    ys = _Flatten([state1])
+    xs = _Flatten([theta, state0, inputs])
+    grad_ys = _Flatten([dstate1])
+    grads = gradients_impl.gradients(ys=ys, xs=xs, grad_ys=grad_ys)
+    return _ConvertNoneGradientToZeros([theta, state0, inputs],
+                                       _Pack(grads, [theta, state0, inputs]))
+
+  return CellGrad
+
+
+def _IsSingleTimeStep(inputs, max_input_length):
+  """Returns True only if the time dimension of inputs is 1."""
+  if not isinstance(max_input_length, ops.Tensor):
+    return max_input_length == 1
+  for x in nest.flatten(inputs):
+    if x.shape.dims is None or x.shape[0].value != 1:
+      return False
+  return True
+
+
+def Recurrent(theta,
+              state0,
+              inputs,
+              cell_fn,
+              cell_grad=None,
+              extras=None,
+              max_input_length=None,
+              use_tpu=False):
+  """Compute a recurrent neural net.
+
+  Roughly, Recurrent() computes the following:
+    state = state0
+    for t in inputs' sequence length:
+      state = cell_fn(theta, state, inputs[t, :])
+      accumulate_state[t, :] = state
+    return accumulate_state, state
+
+  theta, state, inputs are all structures of tensors.
+
+  inputs[t, :] means taking a slice out from every tensor in the inputs.
+
+  accumulate_state[t, :] = state means that we stash every tensor in
+  'state' into a slice of the corresponding tensor in
+  accumulate_state.
+
+  cell_fn is a python callable computing (building up a TensorFlow
+  graph) the recurrent neural network's one forward step. Two calls of
+  cell_fn must describe two identical computations.
+
+  By construction, Recurrent()'s backward computation does not access
+  any intermediate values computed by cell_fn during forward
+  computation. We may extend Recurrent() to support that by taking a
+  customized backward function of cell_fn.
+
+  Args:
+    theta: weights. A structure of tensors.
+    state0: initial state. A structure of tensors.
+    inputs: inputs. A structure of tensors.
+    cell_fn: A python function, which computes:
+      state1, extras = cell_fn(theta, state0, inputs[t, :])
+    cell_grad: A python function which computes:
+      dtheta, dstate0, dinputs[t, :] = cell_grad(
+        theta, state0, inputs[t, :], extras, dstate1)
+    extras: A structure of tensors. The 2nd return value of every
+      invocation of cell_fn is a structure of tensors with matching keys
+      and shapes of  this `extras`.
+    max_input_length: maximum length of effective input. This is used to
+      truncate the computation if the inputs have been allocated to a
+      larger size. A scalar tensor.
+    use_tpu: whether or not we are on TPU.
+
+  Returns:
+    accumulate_state and the final state.
+  """
+  if cell_grad is None and _IsSingleTimeStep(inputs, max_input_length):
+    # The seqlen length is staticly known as 1. Hence, we just need to
+    # call cell_fn once without putting it into a loop.
+    inputs = nest.map_structure(lambda x: array_ops.squeeze(x, axis=0), inputs)
+    state1, _ = cell_fn(theta, state0, inputs)
+    acc_state = nest.map_structure(lambda x: array_ops.expand_dims(x, axis=0),
+                                   state1)
+    return acc_state, state1
+
+  # If cell_grad is not given, derives the gradient function from
+  # cell_fn.
+  cell_grad = _GetCellGrad(cell_fn, cell_grad)
+
+  if extras is None:
+    # Derives 'extras' so that we can allocate extras' accumulator.
+    _, extras = cell_fn(theta, state0, _Index(inputs, 0))
+    extras = nest.map_structure(array_ops.zeros_like, extras)
+  else:
+    _, actual = cell_fn(theta, state0, _Index(inputs, 0))
+    _AssertIsCompatible(extras, actual)
+
+  return _Recurrent(
+      cell_fn=cell_fn,
+      cell_grad=cell_grad,
+      theta=theta,
+      state0=state0,
+      inputs=inputs,
+      max_input_length=max_input_length,
+      extras=extras,
+      use_tpu=use_tpu).Compute()
diff --git a/tensorflow/experimental_api.py b/tensorflow/contrib/recurrent/python/recurrent_api.py
similarity index 53%
rename from tensorflow/experimental_api.py
rename to tensorflow/contrib/recurrent/python/recurrent_api.py
index 63a8aa9cb1dc130a7999c3b248815633998c4cd0..ffe1dcf7dc49554db56ee8e8fabedf976310a554 100644
--- a/tensorflow/experimental_api.py
+++ b/tensorflow/contrib/recurrent/python/recurrent_api.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,26 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-# Bring in all of the public TensorFlow interface into this
-# module.
+"""Recurrent computations library."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=g-bad-import-order
-from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
-# pylint: disable=wildcard-import
-from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
-# pylint: enable=wildcard-import
-
-from tensorflow.python.util.lazy_loader import LazyLoader
-contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
-del LazyLoader
-
-from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-app.flags = flags  # pylint: disable=undefined-variable
+# pylint: disable=unused-import
+from tensorflow.contrib.recurrent.python.ops import functional_bidirectional_rnn
+from tensorflow.contrib.recurrent.python.ops import functional_rnn
+from tensorflow.contrib.recurrent.python.ops import Recurrent
+# pylint: enable=unused-import
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..597f18c77197127cf99a3fbd0d2d22cac9131792
--- /dev/null
+++ b/tensorflow/contrib/rpc/BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "rpc",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
+)
diff --git a/tensorflow/contrib/rpc/__init__.py b/tensorflow/contrib/rpc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c65c1a05def92b91ddd75e7aecdb4e4d9b8abe8a
--- /dev/null
+++ b/tensorflow/contrib/rpc/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to RPC.
+
+@@rpc
+@@try_rpc
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rpc.python.ops.rpc_op import rpc
+from tensorflow.contrib.rpc.python.ops.rpc_op import try_rpc
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/rpc/python/ops/BUILD b/tensorflow/contrib/rpc/python/ops/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..84d2a1832f14b61ec313e7a1a00b0672bc410cfb
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/ops/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+py_library(
+    name = "rpc_op_py",
+    srcs = ["rpc_op.py"],
+    deps = [
+        ":gen_rpc_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_rpc_op_py",
+    out = "gen_rpc_op.py",
+    deps = [
+        "//tensorflow/core:rpc_ops_op_lib",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/ops/rpc_op.py b/tensorflow/contrib/rpc/python/ops/rpc_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1b6c41137828950e73757579dca1eba4adf2ae4
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/ops/rpc_op.py
@@ -0,0 +1,26 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""RPC communication."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rpc.python.ops.gen_rpc_op import rpc
+from tensorflow.contrib.rpc.python.ops.gen_rpc_op import try_rpc
+from tensorflow.python.framework import ops
+ops.NotDifferentiable("Rpc")
+ops.NotDifferentiable("TryRpc")
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index e431c464ef14e86faf30221ed06061f41da528fb..26fd4e2023806765ea4088f4c13a780ca7338bff 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -48,16 +48,14 @@ py_library(
     ],
 )
 
-py_test(
-    name = "reader_test",
-    size = "small",
-    srcs = ["python/saved_model/reader_test.py"],
+py_library(
+    name = "reader",
+    srcs = ["python/saved_model/reader.py"],
     srcs_version = "PY2AND3",
     tags = ["no_windows"],  # TODO: needs investigation on Windows
-    visibility = ["//visibility:private"],
+    visibility = ["//visibility:public"],
     deps = [
         ":saved_model_py",
-        "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:variables",
@@ -66,6 +64,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "reader_test",
+    size = "small",
+    srcs = ["python/saved_model/reader_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:private"],
+    deps = [
+        ":reader",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "signature_def_utils_test",
     size = "small",
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index d508cf3f9db81aa7c3a1174ed13f2310b0595b04..6781433a1f7ac712a62cfd19f1a2ecb632509fd4 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -110,7 +111,12 @@ class AttentionWrapperTest(test.TestCase):
                          alignment_history=False,
                          expected_final_alignment_history=None,
                          attention_layer_size=6,
+                         attention_layer=None,
                          name=''):
+    attention_layer_sizes = (
+        [attention_layer_size] if attention_layer_size is not None else None)
+    attention_layers = (
+        [attention_layer] if attention_layer is not None else None)
     self._testWithMaybeMultiAttention(
         is_multi=False,
         create_attention_mechanisms=[create_attention_mechanism],
@@ -119,7 +125,8 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depths=[attention_mechanism_depth],
         alignment_history=alignment_history,
         expected_final_alignment_history=expected_final_alignment_history,
-        attention_layer_sizes=[attention_layer_size],
+        attention_layer_sizes=attention_layer_sizes,
+        attention_layers=attention_layers,
         name=name)
 
   def _testWithMaybeMultiAttention(self,
@@ -131,6 +138,7 @@ class AttentionWrapperTest(test.TestCase):
                                    alignment_history=False,
                                    expected_final_alignment_history=None,
                                    attention_layer_sizes=None,
+                                   attention_layers=None,
                                    name=''):
     # Allow is_multi to be True with a single mechanism to enable test for
     # passing in a single mechanism in a list.
@@ -144,12 +152,18 @@ class AttentionWrapperTest(test.TestCase):
     encoder_output_depth = 10
     cell_depth = 9
 
-    if attention_layer_sizes is None:
-      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
-    else:
+    if attention_layer_sizes is not None:
       # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
       attention_depth = sum([attention_layer_size or encoder_output_depth
                              for attention_layer_size in attention_layer_sizes])
+    elif attention_layers is not None:
+      # Compute sum of attention_layers output depth.
+      attention_depth = sum(
+          attention_layer.compute_output_shape(
+              [batch_size, cell_depth + encoder_output_depth])[-1].value
+          for attention_layer in attention_layers)
+    else:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
 
     decoder_inputs = array_ops.placeholder_with_default(
         np.random.randn(batch_size, decoder_max_time,
@@ -171,13 +185,20 @@ class AttentionWrapperTest(test.TestCase):
       with vs.variable_scope(
           'root',
           initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
+        attention_layer_size = attention_layer_sizes
+        attention_layer = attention_layers
+        if not is_multi:
+          if attention_layer_size is not None:
+            attention_layer_size = attention_layer_size[0]
+          if attention_layer is not None:
+            attention_layer = attention_layer[0]
         cell = rnn_cell.LSTMCell(cell_depth)
         cell = wrapper.AttentionWrapper(
             cell,
             attention_mechanisms if is_multi else attention_mechanisms[0],
-            attention_layer_size=(attention_layer_sizes if is_multi
-                                  else attention_layer_sizes[0]),
-            alignment_history=alignment_history)
+            attention_layer_size=attention_layer_size,
+            alignment_history=alignment_history,
+            attention_layer=attention_layer)
         helper = helper_py.TrainingHelper(decoder_inputs,
                                           decoder_sequence_length)
         my_decoder = basic_decoder.BasicDecoder(
@@ -260,6 +281,41 @@ class AttentionWrapperTest(test.TestCase):
             expected_final_alignment_history,
             final_alignment_history_info)
 
+  def testBahdanauNormalizedDType(self):
+    for dtype in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.BahdanauAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          normalize=True,
+          dtype=dtype,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dtype, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testBahdanauNotNormalized(self):
     create_attention_mechanism = wrapper.BahdanauAttention
 
@@ -797,6 +853,48 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testMultiAttention')
 
+  def testMultiAttentionWithLayerInstances(self):
+    create_attention_mechanisms = (
+        wrapper.BahdanauAttention, wrapper.LuongAttention)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 7), dtype=dtype('float32'), mean=0.0011709079),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=3.2000000000000002))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0038725811),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019329828)),
+        attention=ResultSummary(
+            shape=(5, 7), dtype=dtype('float32'), mean=0.001174294),
+        time=3,
+        alignments=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=())
+
+    expected_final_alignment_history = (
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
+
+    self._testWithMaybeMultiAttention(
+        True,
+        create_attention_mechanisms,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depths=[9, 9],
+        attention_layers=[layers_core.Dense(3, use_bias=False),
+                          layers_core.Dense(4, use_bias=False)],
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testMultiAttention')
+
   def testLuongMonotonicHard(self):
     # Run attention mechanism with mode='hard', make sure probabilities are hard
     b, t, u, d = 10, 20, 30, 40
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index f0f143ddfcf17c0e471add804ac4920b02da68e0..a0f57417b81b475cfd07ffafd1620069ad595767 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -472,7 +472,8 @@ def _bahdanau_score(processed_query, keys, normalize):
     # Scalar used in weight normalization
     g = variable_scope.get_variable(
         "attention_g", dtype=dtype,
-        initializer=math.sqrt((1. / num_units)))
+        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))),
+        shape=())
     # Bias added prior to the nonlinearity
     b = variable_scope.get_variable(
         "attention_b", [num_units], dtype=dtype,
@@ -1082,7 +1083,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
                cell_input_fn=None,
                output_attention=True,
                initial_cell_state=None,
-               name=None):
+               name=None,
+               attention_layer=None):
     """Construct the `AttentionWrapper`.
 
     **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
@@ -1125,7 +1127,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         (default), use the context as attention at each time step. Otherwise,
         feed the context and cell output into the attention layer to generate
         attention at each time step. If attention_mechanism is a list,
-        attention_layer_size must be a list of the same length.
+        attention_layer_size must be a list of the same length. If
+        attention_layer is set, this must be None.
       alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
@@ -1145,12 +1148,19 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         does not match the batch size of `initial_cell_state`, proper
         behavior is not guaranteed.
       name: Name to use when creating ops.
+      attention_layer: A list of `tf.layers.Layer` instances or a
+        single `tf.layers.Layer` instance taking the context and cell output as
+        inputs to generate attention at each time step. If None (default), use
+        the context as attention at each time step. If attention_mechanism is a
+        list, attention_layer must be a list of the same length. If
+        attention_layers_size is set, this must be None.
 
     Raises:
       TypeError: `attention_layer_size` is not None and (`attention_mechanism`
         is a list but `attention_layer_size` is not; or vice versa).
       ValueError: if `attention_layer_size` is not None, `attention_mechanism`
-        is a list, and its length does not match that of `attention_layer_size`.
+        is a list, and its length does not match that of `attention_layer_size`;
+        if `attention_layer_size` and `attention_layer` are set simultaneously.
     """
     super(AttentionWrapper, self).__init__(name=name)
     rnn_cell_impl.assert_like_rnncell("cell", cell)
@@ -1181,6 +1191,10 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             "cell_input_fn must be callable, saw type: %s"
             % type(cell_input_fn).__name__)
 
+    if attention_layer_size is not None and attention_layer is not None:
+      raise ValueError("Only one of attention_layer_size and attention_layer "
+                       "should be set")
+
     if attention_layer_size is not None:
       attention_layer_sizes = tuple(
           attention_layer_size
@@ -1199,6 +1213,22 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
               dtype=attention_mechanisms[i].dtype)
           for i, attention_layer_size in enumerate(attention_layer_sizes))
       self._attention_layer_size = sum(attention_layer_sizes)
+    elif attention_layer is not None:
+      self._attention_layers = tuple(
+          attention_layer
+          if isinstance(attention_layer, (list, tuple))
+          else (attention_layer,))
+      if len(self._attention_layers) != len(attention_mechanisms):
+        raise ValueError(
+            "If provided, attention_layer must contain exactly one "
+            "layer per attention_mechanism, saw: %d vs %d"
+            % (len(self._attention_layers), len(attention_mechanisms)))
+      self._attention_layer_size = sum(
+          layer.compute_output_shape(
+              [None,
+               cell.output_size + mechanism.values.shape[-1].value])[-1].value
+          for layer, mechanism in zip(
+              self._attention_layers, attention_mechanisms))
     else:
       self._attention_layers = None
       self._attention_layer_size = sum(
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
index 35c4b5bec172858b39dd4628a37e164efe87bdbf..345eb6cfaa67fd4cda6e7e3f01a1243bbf3c9fa1 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.signal.python.kernel_tests import test_util
 from tensorflow.contrib.signal.python.ops import mel_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 # mel spectrum constants and functions.
@@ -173,6 +174,18 @@ class LinearToMelTest(test.TestCase):
         rewritten_graph = test_util.grappler_optimize(g, [mel_matrix])
         self.assertEqual(1, len(rewritten_graph.node))
 
+  def test_num_spectrogram_bins_dynamic(self):
+    with self.test_session(use_gpu=True):
+      num_spectrogram_bins = array_ops.placeholder(shape=(),
+                                                   dtype=dtypes.int32)
+      mel_matrix_np = spectrogram_to_mel_matrix(
+          20, 129, 8000.0, 125.0, 3800.0)
+      mel_matrix = mel_ops.linear_to_mel_weight_matrix(
+          20, num_spectrogram_bins, 8000.0, 125.0, 3800.0)
+      self.assertAllClose(
+          mel_matrix_np,
+          mel_matrix.eval(feed_dict={num_spectrogram_bins: 129}), atol=3e-6)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index d1a36548d95cf44d2bf7e6108141aeb00853db04..1e84006116daa3f28c760037cb9eeafd53eaafb8 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -64,14 +64,11 @@ def _hertz_to_mel(frequencies_hertz, name=None):
         1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
 
 
-def _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+def _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype):
   """Checks the inputs to linear_to_mel_weight_matrix."""
   if num_mel_bins <= 0:
     raise ValueError('num_mel_bins must be positive. Got: %s' % num_mel_bins)
-  if num_spectrogram_bins <= 0:
-    raise ValueError('num_spectrogram_bins must be positive. Got: %s' %
-                     num_spectrogram_bins)
   if sample_rate <= 0.0:
     raise ValueError('sample_rate must be positive. Got: %s' % sample_rate)
   if lower_edge_hertz < 0.0:
@@ -122,9 +119,9 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
 
   Args:
     num_mel_bins: Python int. How many bands in the resulting mel spectrum.
-    num_spectrogram_bins: Python int. How many bins there are in the source
-      spectrogram data, which is understood to be `fft_size // 2 + 1`, i.e. the
-      spectrogram only contains the nonredundant FFT bins.
+    num_spectrogram_bins: An integer `Tensor`. How many bins there are in the
+      source spectrogram data, which is understood to be `fft_size // 2 + 1`,
+      i.e. the spectrogram only contains the nonredundant FFT bins.
     sample_rate: Python float. Samples per second of the input signal used to
       create the spectrogram. We need this to figure out the actual frequencies
       for each spectrogram bin, which dictates how they are mapped into the mel
@@ -148,7 +145,10 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
   [mel]: https://en.wikipedia.org/wiki/Mel_scale
   """
   with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
-    _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+    # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
+    # and the validation is already done in linspace (both in shape function
+    # and in kernel), there is no need to validate num_spectrogram_bins here.
+    _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype)
 
     # To preserve accuracy, we compute the matrix at float64 precision and then
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 40f484fd78302163ba36142dec057478fe899189..746b95564237617359afe1791484809369c4a894 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -290,9 +290,9 @@ slim.stack(x, slim.conv2d, [(32, [3, 3]), (32, [1, 1]), (64, [3, 3]), (64, [1, 1
 
 In addition to the types of scope mechanisms in TensorFlow
 ([name_scope](https://www.tensorflow.org/api_docs/python/tf/name_scope),
-[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope),
+[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope)),
 TF-Slim adds a new scoping mechanism called
-[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope),
+[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope).
 This new scope allows a user to specify one or more operations and a set of
 arguments which will be passed to each of the operations defined in the
 `arg_scope`. This functionality is best illustrated by example. Consider the
@@ -761,8 +761,8 @@ parts:
 3. Finalization: (optionally) perform any final operation to compute metric
 values. For example, computing means, mins, maxes, etc.
 
-For example, to compute `mean_absolute_error`, two variables, a `count` and
-`total` variable are *initialized* to zero. During *aggregation*, we observed
+For example, to compute `mean_absolute_error`, two variables (`count` and
+`total`) are *initialized* to zero. During *aggregation*, we observed
 some set of predictions and labels, compute their absolute differences and add
 the total to `total`. Each time we observe another value,
 `count` is incremented. Finally, during *finalization*, `total` is divided
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index 3caf4e02da3aa2d7e586c4e76807a11f84585ea6..5cfd5ee82e2a0fce33311a8783d2d4ceb031544d 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -230,6 +230,7 @@ def evaluation_loop(master,
                     max_number_of_evaluations=None,
                     session_config=None,
                     timeout=None,
+                    timeout_fn=None,
                     hooks=None):
   """Runs TF-Slim's Evaluation Loop.
 
@@ -261,6 +262,9 @@ def evaluation_loop(master,
       configure the `Session`. If left as `None`, the default will be used.
     timeout: The maximum amount of time to wait between checkpoints. If left as
       `None`, then the process will wait indefinitely.
+    timeout_fn: Optional function to call after a timeout.  If the function
+      returns True, then it means that no new checkpoints will be generated and
+      the iterator will exit.  The function is called with no arguments.
     hooks: A list of additional `SessionRunHook` objects to pass during
       repeated evaluations.
 
@@ -298,4 +302,5 @@ def evaluation_loop(master,
       hooks=all_hooks,
       config=session_config,
       max_number_of_evaluations=max_number_of_evaluations,
-      timeout=timeout)
+      timeout=timeout,
+      timeout_fn=timeout_fn)
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index c24bd048512daaae116e732ac437f7c9b6f6d7fc..94fc12ca814721acf62f16b72ffa50473043cc8b 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -177,6 +177,17 @@ class EvaluationTest(test.TestCase):
     # The timeout kicked in.
     self.assertLess(end, start + 1.1)
 
+  def testTimeoutFnOnEvaluationLoop(self):
+    # We require a mutable object (e.g. list but not an int) to maintain state
+    # across calls of a nested function.
+    timeout_fn_calls = [0]
+    def _TimeoutFn():
+      timeout_fn_calls[0] += 1
+      return timeout_fn_calls[0] >= 3
+    # Need not do any evaluation, but should just call timeout_fn repeatedly.
+    evaluation.evaluation_loop('', '', '', timeout=0, timeout_fn=_TimeoutFn)
+    self.assertEqual(timeout_fn_calls[0], 3)
+
   def testMonitorCheckpointsLoopTimeout(self):
     ret = list(
         evaluation_lib.checkpoints_iterator(
diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py
index ca937546f50df46b7e5b1144dcbdc380cb04ca9b..0cca40f071c889773736ce009b32ba17728041ce 100644
--- a/tensorflow/contrib/stateless/__init__.py
+++ b/tensorflow/contrib/stateless/__init__.py
@@ -22,6 +22,7 @@ WARNING: These ops are in contrib, and are not stable.  They should be
 consistent across multiple runs on the same hardware, but only for the same
 version of the code.
 
+@@stateless_multinomial
 @@stateless_random_uniform
 @@stateless_random_normal
 @@stateless_truncated_normal
@@ -37,6 +38,7 @@ from tensorflow.contrib.stateless.gen_stateless_random_ops import *
 from tensorflow.python.framework import ops
 from tensorflow.python.util.all_util import remove_undocumented
 
+ops.NotDifferentiable("StatelessMultinomial")
 ops.NotDifferentiable("StatelessRandomNormal")
 ops.NotDifferentiable("StatelessRandomUniform")
 ops.NotDifferentiable("StatelessTruncatedNormal")
diff --git a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
index bea6341cfdcf7d56f255bec275b7861228e44e12..d724a5c014d2f9f5f6e3a6704341bcb8c429ae06 100644
--- a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
+++ b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
@@ -96,6 +96,52 @@ class StatelessOpsTest(test.TestCase):
               for s1, v1 in values:
                 self.assertEqual(s0 == s1, np.all(v0 == v1))
 
+  def testMatchStatefulMultinomial(self):
+    # Stateless ops should be the same as stateful ops on the first call
+    # after seed scrambling.
+    key = 0x3ec8f720, 0x02461e29
+    num_samples = 4
+    for logits_dtype in np.float16, np.float32, np.float64:
+      for output_dtype in dtypes.int32, dtypes.int64:
+        for seed in (7, 17), (11, 5), (2, 3):
+          preseed = invert_philox(key,
+                                  (seed[0], 0, seed[1], 0)).astype(np.uint64)
+          preseed = preseed[::2] | preseed[1::2] << 32
+          random_seed.set_random_seed(seed[0])
+          with self.test_session(use_gpu=True):
+            for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                      [0.25, 0.75]]):
+              logits_t = constant_op.constant(logits, dtype=logits_dtype)
+              stateful = random_ops.multinomial(
+                  logits_t,
+                  num_samples,
+                  seed=seed[1],
+                  output_dtype=output_dtype)
+              pure = stateless.stateless_multinomial(
+                  logits_t,
+                  num_samples,
+                  seed=preseed,
+                  output_dtype=output_dtype)
+              self.assertAllEqual(stateful.eval(), pure.eval())
+
+  def testDeterminismMultinomial(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    num_samples = 10
+    with self.test_session(use_gpu=True):
+      for seed_type in [dtypes.int32, dtypes.int64]:
+        seed_t = array_ops.placeholder(seed_type, shape=[2])
+        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+        for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                  [0.25, 0.75]]):
+          pure = stateless.stateless_multinomial(
+              logits, num_samples, seed=seed_t)
+          values = [
+              (seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds
+          ]
+          for s0, v0 in values:
+            for s1, v1 in values:
+              self.assertEqual(s0 == s1, np.all(v0 == v1))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index fda1367b156c86f385f31cc41c5fca747cf8668d..f88b03ec4c2b1f250091594ea12d7d1862029fa2 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -15,7 +15,6 @@ py_test(
     srcs = ["summary_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":summary_ops",
         ":summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
@@ -23,6 +22,7 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
@@ -35,7 +35,6 @@ py_test(
     srcs = ["summary_ops_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":summary_ops",
         ":summary_test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -44,31 +43,9 @@ py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "summary_ops",
-    srcs = ["summary_ops.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:layers_base",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:summary_op_util",
-        "//tensorflow/python:summary_ops_gen",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python:variables",
         "@six_archive//:six",
     ],
 )
@@ -79,7 +56,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":summary_ops",
+        "//tensorflow/python:summary_ops_v2",
     ],
 )
 
@@ -92,8 +69,10 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
+        "//tensorflow/python:summary_ops_v2",
         "@org_sqlite//:python",
     ],
 )
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 2d6d7ea6a3eff2562ba8def4117e3aa6f818b6fd..99ced53e1167ec5486d0b75cff81ffbf857c2be7 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -61,23 +61,23 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
-from tensorflow.contrib.summary.summary_ops import all_summary_ops
-from tensorflow.contrib.summary.summary_ops import always_record_summaries
-from tensorflow.contrib.summary.summary_ops import audio
-from tensorflow.contrib.summary.summary_ops import create_db_writer
-from tensorflow.contrib.summary.summary_ops import create_file_writer
-from tensorflow.contrib.summary.summary_ops import create_summary_file_writer
-from tensorflow.contrib.summary.summary_ops import eval_dir
-from tensorflow.contrib.summary.summary_ops import flush
-from tensorflow.contrib.summary.summary_ops import generic
-from tensorflow.contrib.summary.summary_ops import graph
-from tensorflow.contrib.summary.summary_ops import histogram
-from tensorflow.contrib.summary.summary_ops import image
-from tensorflow.contrib.summary.summary_ops import import_event
-from tensorflow.contrib.summary.summary_ops import initialize
-from tensorflow.contrib.summary.summary_ops import never_record_summaries
-from tensorflow.contrib.summary.summary_ops import record_summaries_every_n_global_steps
-from tensorflow.contrib.summary.summary_ops import scalar
-from tensorflow.contrib.summary.summary_ops import should_record_summaries
-from tensorflow.contrib.summary.summary_ops import summary_writer_initializer_op
-from tensorflow.contrib.summary.summary_ops import SummaryWriter
+from tensorflow.python.ops.summary_ops_v2 import all_summary_ops
+from tensorflow.python.ops.summary_ops_v2 import always_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import audio
+from tensorflow.python.ops.summary_ops_v2 import create_db_writer
+from tensorflow.python.ops.summary_ops_v2 import create_file_writer
+from tensorflow.python.ops.summary_ops_v2 import create_summary_file_writer
+from tensorflow.python.ops.summary_ops_v2 import eval_dir
+from tensorflow.python.ops.summary_ops_v2 import flush
+from tensorflow.python.ops.summary_ops_v2 import generic
+from tensorflow.python.ops.summary_ops_v2 import graph
+from tensorflow.python.ops.summary_ops_v2 import histogram
+from tensorflow.python.ops.summary_ops_v2 import image
+from tensorflow.python.ops.summary_ops_v2 import import_event
+from tensorflow.python.ops.summary_ops_v2 import initialize
+from tensorflow.python.ops.summary_ops_v2 import never_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import record_summaries_every_n_global_steps
+from tensorflow.python.ops.summary_ops_v2 import scalar
+from tensorflow.python.ops.summary_ops_v2 import should_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import summary_writer_initializer_op
+from tensorflow.python.ops.summary_ops_v2 import SummaryWriter
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 3aba04540eba12092d884cca10e23546eb91c91d..ae8336daaf8ea9113716b90b6ea9be9de7303596 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -16,27 +16,220 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
+import time
 
 import six
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
 
 get_all = summary_test_util.get_all
 
 
-class DbTest(summary_test_util.SummaryDbTest):
+class GraphFileTest(test_util.TensorFlowTestCase):
+
+  def testSummaryOps(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.generic('tensor', 1, step=1)
+      summary_ops.scalar('scalar', 2.0, step=1)
+      summary_ops.histogram('histogram', [1.0], step=1)
+      summary_ops.image('image', [[[[1.0]]]], step=1)
+      summary_ops.audio('audio', [[1.0]], 1.0, 1, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    # The working condition of the ops is tested in the C++ test so we just
+    # test here that we're calling them correctly.
+    self.assertTrue(gfile.Exists(logdir))
+
+  def testSummaryName(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual('scalar', events[1].summary.value[0].tag)
+
+  def testSummaryNameScope(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      with ops.name_scope('scope'):
+        summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual('scope/scalar', events[1].summary.value[0].tag)
+
+  def testSummaryGlobalStep(self):
+    training_util.get_or_create_global_step()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(summary_ops.summary_writer_initializer_op())
+      step, _ = sess.run(
+          [training_util.get_global_step(), summary_ops.all_summary_ops()])
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(step, events[1].step)
+
+  def testMaxQueue(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(
+        logdir, max_queue=1, flush_millis=999999)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      # Should flush after second summary since max_queue = 1
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(3, get_total())
+
+  def testFlushFunction(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(
+        logdir, max_queue=999999, flush_millis=999999)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+      flush_op = summary_ops.flush()
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      sess.run(flush_op)
+      self.assertEqual(2, get_total())
+      # Test "writer" parameter
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(summary_ops.flush(writer=writer))
+      self.assertEqual(3, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(summary_ops.flush(writer=writer._resource))  # pylint:disable=protected-access
+      self.assertEqual(4, get_total())
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.scalar('two', 2.0, step=2)
+      # Create with different shared name (should be separate resource/file)
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.scalar('three', 3.0, step=3)
+
+    with self.test_session() as sess:
+      # Run init ops across writers sequentially to avoid race condition.
+      # TODO(nickfelt): fix race condition in resource manager lookup or create
+      sess.run(writer1.init())
+      sess.run(writer2.init())
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      sess.run(writer3.init())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run([writer1.flush(), writer2.flush(), writer3.flush()])
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*tfevents*'))))
+
+    # First file has tags "one" and "two"
+    events = summary_test_util.events_from_file(next(event_files))
+    self.assertEqual('brain.Event:2', events[0].file_version)
+    tags = [e.summary.value[0].tag for e in events[1:]]
+    self.assertItemsEqual(['one', 'two'], tags)
+
+    # Second file has tag "three"
+    events = summary_test_util.events_from_file(next(event_files))
+    self.assertEqual('brain.Event:2', events[0].file_version)
+    tags = [e.summary.value[0].tag for e in events[1:]]
+    self.assertItemsEqual(['three'], tags)
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      # Running init() again while writer is open has no effect
+      sess.run(writer.init())
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      # Running close() should do an implicit flush
+      sess.run(writer.close())
+      self.assertEqual(2, get_total())
+      # Running init() on a closed writer should start a new file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      sess.run(writer.init())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(writer.close())
+      files = sorted(gfile.Glob(os.path.join(logdir, '*tfevents*')))
+      self.assertEqual(2, len(files))
+      self.assertEqual(2, len(summary_test_util.events_from_file(files[1])))
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      sess.run(writer.flush())
+      self.assertEqual(2, get_total())
+
+
+class GraphDbTest(summary_test_util.SummaryDbTest):
 
   def testGraphPassedToGraph_isForbiddenForThineOwnSafety(self):
     with self.assertRaises(TypeError):
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index c756f8b27055f9cf86a311e485d97745a3c7a95b..f1ef218e74bbd225071324a8269fdfeb5de0e038 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -16,12 +16,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
+import time
 
 import numpy as np
 import six
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -33,6 +34,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
@@ -57,7 +59,7 @@ _NUMPY_NUMERIC_TYPES = {
 }
 
 
-class TargetTest(test_util.TensorFlowTestCase):
+class EagerFileTest(test_util.TensorFlowTestCase):
 
   def testShouldRecordSummary(self):
     self.assertFalse(summary_ops.should_record_summaries())
@@ -138,21 +140,22 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testMaxQueue(self):
     logs = tempfile.mkdtemp()
     with summary_ops.create_file_writer(
-        logs, max_queue=2, flush_millis=999999,
+        logs, max_queue=1, flush_millis=999999,
         name='lol').as_default(), summary_ops.always_record_summaries():
       get_total = lambda: len(summary_test_util.events_from_logdir(logs))
       # Note: First tf.Event is always file_version.
       self.assertEqual(1, get_total())
       summary_ops.scalar('scalar', 2.0, step=1)
       self.assertEqual(1, get_total())
+      # Should flush after second summary since max_queue = 1
       summary_ops.scalar('scalar', 2.0, step=2)
       self.assertEqual(3, get_total())
 
-  def testFlush(self):
+  def testFlushFunction(self):
     logs = tempfile.mkdtemp()
-    with summary_ops.create_file_writer(
-        logs, max_queue=999999, flush_millis=999999,
-        name='lol').as_default(), summary_ops.always_record_summaries():
+    writer = summary_ops.create_file_writer(
+        logs, max_queue=999999, flush_millis=999999, name='lol')
+    with writer.as_default(), summary_ops.always_record_summaries():
       get_total = lambda: len(summary_test_util.events_from_logdir(logs))
       # Note: First tf.Event is always file_version.
       self.assertEqual(1, get_total())
@@ -161,9 +164,103 @@ class TargetTest(test_util.TensorFlowTestCase):
       self.assertEqual(1, get_total())
       summary_ops.flush()
       self.assertEqual(3, get_total())
+      # Test "writer" parameter
+      summary_ops.scalar('scalar', 2.0, step=3)
+      summary_ops.flush(writer=writer)
+      self.assertEqual(4, get_total())
+      summary_ops.scalar('scalar', 2.0, step=4)
+      summary_ops.flush(writer=writer._resource)  # pylint:disable=protected-access
+      self.assertEqual(5, get_total())
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+        summary_ops.flush()
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.scalar('two', 2.0, step=2)
+        summary_ops.flush()
+      # Create with different shared name (should be separate resource/file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.scalar('three', 3.0, step=3)
+        summary_ops.flush()
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*tfevents*'))))
+
+    # First file has tags "one" and "two"
+    events = iter(summary_test_util.events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual('one', next(events).summary.value[0].tag)
+    self.assertEqual('two', next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file has tag "three"
+    events = iter(summary_test_util.events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual('three', next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      # Calling init() again while writer is open has no effect
+      writer.init()
+      self.assertEqual(1, get_total())
+      try:
+        # Not using .as_default() to avoid implicit flush when exiting
+        writer.set_as_default()
+        summary_ops.scalar('one', 1.0, step=1)
+        self.assertEqual(1, get_total())
+        # Calling .close() should do an implicit flush
+        writer.close()
+        self.assertEqual(2, get_total())
+        # Calling init() on a closed writer should start a new file
+        time.sleep(1.1)  # Ensure filename has a different timestamp
+        writer.init()
+        files = sorted(gfile.Glob(os.path.join(logdir, '*tfevents*')))
+        self.assertEqual(2, len(files))
+        get_total = lambda: len(summary_test_util.events_from_file(files[1]))
+        self.assertEqual(1, get_total())  # file_version Event
+        summary_ops.scalar('two', 2.0, step=2)
+        writer.close()
+        self.assertEqual(2, get_total())
+      finally:
+        # Clean up by resetting default writer
+        summary_ops.create_file_writer(None).set_as_default()
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+        self.assertEqual(1, get_total())
+        writer.flush()
+        self.assertEqual(2, get_total())
+        summary_ops.scalar('two', 2.0, step=2)
+      # Exiting the "as_default()" should do an implicit flush of the "two" tag
+      self.assertEqual(3, get_total())
 
 
-class DbTest(summary_test_util.SummaryDbTest):
+class EagerDbTest(summary_test_util.SummaryDbTest):
 
   def testIntegerSummaries(self):
     step = training_util.create_global_step()
diff --git a/tensorflow/contrib/summary/summary_test_internal.py b/tensorflow/contrib/summary/summary_test_internal.py
deleted file mode 100644
index d0d3384735fb1eb1a048c7aa6da0037ee9fc6936..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/summary/summary_test_internal.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Internal helpers for tests in this directory."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-
-import sqlite3
-
-from tensorflow.contrib.summary import summary_ops
-from tensorflow.python.framework import test_util
-
-
-class SummaryDbTest(test_util.TensorFlowTestCase):
-  """Helper for summary database testing."""
-
-  def setUp(self):
-    super(SummaryDbTest, self).setUp()
-    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
-    if os.path.exists(self.db_path):
-      os.unlink(self.db_path)
-    self.db = sqlite3.connect(self.db_path)
-    self.create_db_writer = functools.partial(
-        summary_ops.create_db_writer,
-        db_uri=self.db_path,
-        experiment_name='experiment',
-        run_name='run',
-        user_name='user')
-
-  def tearDown(self):
-    self.db.close()
-    super(SummaryDbTest, self).tearDown()
-
-
-def get_one(db, q, *p):
-  return db.execute(q, p).fetchone()[0]
-
-
-def get_all(db, q, *p):
-  return unroll(db.execute(q, p).fetchall())
-
-
-def unroll(list_of_tuples):
-  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/summary/summary_test_util.py b/tensorflow/contrib/summary/summary_test_util.py
index 8506c4be9c4ca8305b62da17c7246e6e18313bd3..b4ae43302cb22ad17c04050eb84433c470757bf1 100644
--- a/tensorflow/contrib/summary/summary_test_util.py
+++ b/tensorflow/contrib/summary/summary_test_util.py
@@ -24,10 +24,10 @@ import os
 
 import sqlite3
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.core.util import event_pb2
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
index 85b3e7231bcb433e9510522597c03c5f764f06cf..3f24f58f03aac2ba6d368d7eccf8731f611a81b4 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -132,7 +132,7 @@ class SummaryFileWriter : public SummaryWriterInterface {
   Status WriteEvent(std::unique_ptr<Event> event) override {
     mutex_lock ml(mu_);
     queue_.emplace_back(std::move(event));
-    if (queue_.size() >= max_queue_ ||
+    if (queue_.size() > max_queue_ ||
         env_->NowMicros() - last_flush_ > 1000 * flush_millis_) {
       return InternalFlush();
     }
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2f316767b35e190c7e438a253a7395b0c5c2ee16..fd3582e175ee91ba7222d41fd3f834c522b5e28d 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -52,7 +52,6 @@ tf_custom_op_library(
         "ops/trt_engine_op.cc",
     ],
     deps = [
-        ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
@@ -183,6 +182,7 @@ tf_py_wrap_cc(
     copts = tf_copts(),
     deps = [
         ":trt_conversion",
+        ":trt_engine_op_kernel",
         "//tensorflow/core:framework_lite",
         "//util/python:python_headers",
     ],
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index 6eafc1754ca5102c8adf04f00e33dc2f8ff970f6..687dee07e1327d50fabc4e14c25a357ae6c959e7 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -1,59 +1,29 @@
 # Using TensorRT in TensorFlow
 
-
-This module provides necessary bindings and introduces TRT_engine_op
-operator that wraps a subgraph in TensorRT. This is still a work in progress
-but should be useable with most common graphs.
+This module provides necessary bindings and introduces TRT_engine_op operator
+that wraps a subgraph in TensorRT. This is still a work in progress but should
+be useable with most common graphs.
 
 ## Compilation
 
-
-In order to compile the module, you need to have a local TensorRT
-installation ( libnvinfer.so and respective include files ). During the
-configuration step, TensorRT should be enabled and installation path
-should be set. If installed through package managers (deb,rpm),
-configure script should find the necessary components from the system
-automatically. If installed from tar packages, user has to set path to
-location where the library is installed during configuration.
+In order to compile the module, you need to have a local TensorRT installation
+(libnvinfer.so and respective include files). During the configuration step,
+TensorRT should be enabled and installation path should be set. If installed
+through package managers (deb,rpm), configure script should find the necessary
+components from the system automatically. If installed from tar packages, user
+has to set path to location where the library is installed during configuration.
 
 ```shell
 bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
-After the installation of tensorflow package, TensorRT transformation
-will be available. An example use can be found in test/test_tftrt.py script
+After the installation of tensorflow package, TensorRT transformation will be
+available. An example use can be found in test/test_tftrt.py script
 
 ## Installing TensorRT 3.0.4
 
-In order to make use of TensorRT integration, you will need a local installation of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt). Due to compiler compatibility, you will need to download and install the TensorRT 3.0.4 tarball for _Ubuntu 14.04_, i.e., **_TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz_**, even if you are using Ubuntu 16.04 or later.
-
-### Preparing TensorRT installation
-
-Once you have downloaded TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz, you will need to unpack it to an installation directory, which will be referred to as <install_dir>. Please replace <install_dir> with the full path of actual installation directory you choose in commands below.
-
-```shell
-cd <install_dir> && tar -zxf /path/to/TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz
-```
-
-After unpacking the binaries, you have several options to use them:
-
-#### To run TensorFlow as a user without superuser privileges
-
-For a regular user without any sudo rights, you should add TensorRT to your `$LD_LIBRARY_PATH`:
-
-  ```shell
-   export LD_LIBRARY_PATH=<install_dir>/TensorRT-3.0.4/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
-  ```
-
-Then you are ready to use TensorFlow-TensorRT integration. `$LD_LIBRARY_PATH` must contain the path to TensorRT installation for TensorFlow-TensorRT integration to work. If you are using a VirtualEnv-like setup, you can add the command above to your `bin/activate` script or to your `.bashrc` script.
-
-#### To run TensorFlow as a superuser
-
- When running as a superuser, such as in a container or via sudo, the `$LD_LIBRARY_PATH` approach above may not work. The following is preferred when the user has superuser privileges:
-
-  ```shell
-  echo "<install_dir>/TensorRT-3.0.4/lib" | sudo tee /etc/ld.so.conf.d/tensorrt304.conf && sudo ldconfig
-  ```
-
-  Please ensure that any existing deb package installation of TensorRT is removed before following these instructions to avoid package conflicts.
\ No newline at end of file
+In order to make use of TensorRT integration, you will need a local installation
+of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt).
+Installation instructions for compatibility with TensorFlow are provided on the
+[TensorFlow Installation page](https://www.tensorflow.org/install/install_linux#nvidia_requirements_to_run_tensorflow_with_gpu_support).
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
index e663eed4dd6704e2f41bde1dfabd411e86669ecd..9c3698e5d1cc5d6d8d31a8fcaf03d103f1e1915d 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -19,6 +19,12 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 
+std::shared_ptr<TRTResourceManager>
+tensorflow::tensorrt::TRTResourceManager::instance() {
+  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
+  return instance_;
+}
+
 std::shared_ptr<tensorflow::ResourceMgr>
 tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
   // mutex is held for lookup only. Most instantiations where mutex will be held
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
index 5f8ad491d3c13e8911b0b95c3e95e19afe4d59c0..bc15b51e05ef743d0aa260bbd9bd21302a752ec0 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -29,11 +29,7 @@ class TRTResourceManager {
   TRTResourceManager() = default;
 
  public:
-  static std::shared_ptr<TRTResourceManager> instance() {
-    static std::shared_ptr<TRTResourceManager> instance_(
-        new TRTResourceManager);
-    return instance_;
-  }
+  static std::shared_ptr<TRTResourceManager> instance();
   // returns a manager for given op, if it doesn't exists it creates one
   std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
 
diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
index ca56e38ca079f71b38cf29605a295a50929945e8..c58e24e6d9748868791d21b0ff4ec28ca2f646c3 100644
--- a/tensorflow/contrib/timeseries/examples/lstm_test.py
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -36,17 +36,14 @@ class LSTMExampleTest(test.TestCase):
   def test_periodicity_learned(self):
     (observed_times, observed_values,
      all_times, predicted_values) = lstm.train_and_predict(
-         training_steps=100, estimator_config=_SeedRunConfig(),
+         training_steps=2, estimator_config=_SeedRunConfig(),
          export_directory=self.get_temp_dir())
     self.assertAllEqual([100], observed_times.shape)
     self.assertAllEqual([100, 5], observed_values.shape)
     self.assertAllEqual([200], all_times.shape)
     self.assertAllEqual([200, 5], predicted_values.shape)
-    self.assertGreater(
-        predicted_values[100, 4]
-        - predicted_values[115, 4],  # Amplitude of fifth component
-        0.2)
-
+    # TODO(allenl): Make the model deterministic so you can check something
+    # substantive.
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 4de09dd9881474e1c84f19acd9598237b58f5eed..9646d15486ef618f206936ce55a5eb6ca0387e41 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -46,7 +46,7 @@ py_library(
     deps = [
         ":tpu_lib",
         ":tpu_py",
-        "//tensorflow/contrib/summary:summary_ops",
+        "//tensorflow/contrib/training:training_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -56,6 +56,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
@@ -197,7 +198,8 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/data/python/ops:batching",
+        "//tensorflow/contrib/data/python/ops:interleave_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:function",
         "//tensorflow/python:functional_ops",
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 1c32993e8e546a17b8b3c289a306ad8f8388c345..dbf1ab6bbf0ddc7429d8e19279451eb862981e0c 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -46,6 +46,7 @@ tf_cc_binary(
     visibility = ["//visibility:public"],
     deps = [
         ":dump_tpu_profile",
+        ":tpu_profiler_analysis_proto_cc",
         ":tpu_profiler_proto_cc",
         ":version",
         "//tensorflow/core:framework_internal",
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 6b198dbc16e544686e35c1ffee8a7f4d3955dafc..a5358842630bed15ad4f0b71ec2d4042f3223ca1 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
 #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.grpc.pb.h"
 #include "tensorflow/contrib/tpu/profiler/version.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -40,6 +41,7 @@ namespace tensorflow {
 namespace tpu {
 namespace {
 
+using ::tensorflow::grpc::TPUProfileAnalysis;
 using ::tensorflow::TPUProfiler;
 
 constexpr uint64 kMaxEvents = 1000000;
@@ -64,11 +66,10 @@ Status ValidateHostPortPair(const string& host_port) {
   return Status::OK();
 }
 
-// Returns whether the returned trace is empty.
-// Failure are handled by CHECK, i.e. abort()
-bool Profile(const string& service_addr, const string& logdir, int duration_ms,
-             const string& repository_root, const string& session_id,
-             const ProfileOptions& opts) {
+ProfileRequest PopulateProfileRequest(int duration_ms,
+                                      const string& repository_root,
+                                      const string& session_id,
+                                      const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
@@ -83,6 +84,17 @@ bool Profile(const string& service_addr, const string& logdir, int duration_ms,
   *request.mutable_opts() = opts;
   std::cout << "Limiting the number of trace events to " << kMaxEvents
             << std::endl;
+  return request;
+}
+
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+bool Profile(const string& service_addr, const string& logdir, int duration_ms,
+             const string& repository_root, const string& session_id,
+             const ProfileOptions& opts) {
+  ProfileRequest request =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+
   ::grpc::ClientContext context;
   ::grpc::ChannelArguments channel_args;
   // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available.
@@ -120,7 +132,36 @@ bool NewSession(const string& service_addr,
                 const std::vector<tensorflow::string>& hostnames,
                 int duration_ms, const string& repository_root,
                 const string& session_id, const ProfileOptions& opts) {
-  return true;
+  NewProfileSessionRequest new_session_request;
+  *new_session_request.mutable_request() =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+  new_session_request.set_repository_root(repository_root);
+  new_session_request.set_session_id(session_id);
+  std::copy(
+      hostnames.begin(), hostnames.end(),
+      proto2::RepeatedFieldBackInserter(new_session_request.mutable_hosts()));
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+  // TODO(jiesun): GRPC support following relevant naming scheme:
+  // 1. dns:///host:port
+  // 2. ipv4:host:port or ipv6:[host]:port
+  // We might need to change the prefix which depends on what TPU name resolver
+  // will give us.
+  std::unique_ptr<TPUProfileAnalysis::Stub> stub =
+      TPUProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  NewProfileSessionResponse new_session_response;
+  TF_QCHECK_OK(FromGrpcStatus(
+      stub->NewSession(&context, new_session_request, &new_session_response)));
+
+  std::cout << "Profile session succeed for hosts:"
+            << str_util::Join(hostnames, ",");
+  return new_session_response.empty_trace();
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index ae508583f848a94d9a52d8663af96d85d8fff74c..b53f9be2e22902a3123f63a20cd758c722e590dc 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -64,7 +64,8 @@ Status WriteGzippedDataToFile(const string& filename, const string& data) {
 
 Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
                                const string& encoded_trace, std::ostream* os) {
-  string proto_path = JoinPath(run_dir, kProtoTraceFileName);
+  string proto_path =
+      JoinPath(run_dir, StrCat(host_prefix, kProtoTraceFileName));
   TF_RETURN_IF_ERROR(
       WriteStringToFile(Env::Default(), proto_path, encoded_trace));
   LOG(INFO) << "Dumped raw-proto trace data to " << proto_path;
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 2a158756279b5be7f818950b7f7b70571b5bc38e..63955d18068fc9d3b3ca1a657a3fd526edf10e6f 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -66,6 +66,10 @@ message OpMetricsDbResult {
   // The total of the difference between the start times of two
   // consecutive infeed-enqueues (per host) in picoseconds.
   optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
+  // The total device time in microseconds.
+  optional double total_device_time_in_us = 4;
+  // The total host time in microseconds.
+  optional double total_host_time_in_us = 5;
 }
 
 // Result proto for StepInfo.
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index 8505c4bc69b9444519d0bc5d23bab093b8a57163..7be694e866729c58efae4ccf7932dd929c03ed91 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -96,5 +96,10 @@ message ProfileResponse {
 
   // Data payload for each required tools.
   repeated ProfileToolData tool_data = 6;
-  // next-field: 7
+
+  // When we write profiling data directly to repository directory, we need a
+  // way to figure out whether the captured trace is empty (due to idle TPU).
+  bool empty_trace = 7;
+
+  // next-field: 8
 }
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
index a4fc8d4e879eb85522f35663c9c628ecd5ef562c..8b0bbde98e6a1dee8ade789328f3ba0624049562 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
@@ -7,13 +7,15 @@ message NewProfileSessionRequest {
   ProfileRequest request = 1;
   string repository_root = 2;
   repeated string hosts = 3;
+  string session_id = 4;
 }
 
 message NewProfileSessionResponse {
   // Auxiliary error_message.
   string error_message = 1;
-  // If success, return session identifier for future reference.
-  string session_id = 2;
+
+  // Whether all hosts had returned a empty trace.
+  bool empty_trace = 2;
 }
 
 message EnumProfileSessionsAndToolsRequest {
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 3f2db548ace9e10df7844d8fb461670d27234670..a1690dadffe5770af9416a7c5ad3a7e336f6bc18 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -25,6 +25,8 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -56,6 +58,7 @@ _NOT_IMPLEMENTED_OPS = set([
 _MAX_WARNING_LINES = 5
 
 _TPU_REPLICATE_ATTR = "_tpu_replicate"
+_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
 
 
 def _tpu_system_device_name(job):
@@ -121,8 +124,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name):
+  def __init__(self, name, num_replicas):
     super(TPUReplicateContext, self).__init__()
+    self._num_replicas = num_replicas
+    self._outer_device_function_stack = None
+    self._oc_dev_fn_stack = None
+    self._outside_compilation_cluster = None
+    self._outside_compilation_counter = 0
+    self._in_gradient_colocation = None
+    self._gradient_colocation_stack = []
+    self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
 
@@ -136,6 +147,143 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
         logging.warning("... and %d more" %
                         (len(self._unsupported_ops) - _MAX_WARNING_LINES))
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      self._gradient_colocation_stack.append(op)
+      if not self._outside_compilation_cluster:
+        try:
+          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
+          if self._in_gradient_colocation:
+            raise NotImplementedError(
+                "Cannot nest gradient colocation operations outside compilation"
+            )
+          if gradient_uid == "__unsupported__":
+            raise NotImplementedError(
+                "No gradient_uid calling gradient within outside_compilation")
+          # When we take the gradient of an op X in an
+          # outside_compilation cluster C in a forward computation we
+          # would like to put the ops corresponding to the gradient of
+          # X into a new outside_compilation cluster C'. However, if
+          # we take the gradient of X twice, the second one should get
+          # yet another new outside_compilation cluster C''.
+          #
+          # The mechanism we adopt is to use a 'root_cluster' which is
+          # the cluster that X was in before we took gradients, and a
+          # 'gradient_uid' which is different for every invocation of
+          # gradients, and put the gradient of X in cluster
+          # 'root_cluster.gradient_uid'.
+          #
+          # When the gradient code adds multiple Ops, it asks them to
+          # be colocated either with the original Op X, or with one of
+          # the preceding Ops that was added to the gradient. In other
+          # words, we want to detect the case where we are colocating
+          # with an Op that is in cluster root_cluster.gradient_uid
+          # and put the new Op in that same cluster if the
+          # gradient_uid is the same (the case that we are in the same
+          # invocation of gradients, and just adding new Ops to the
+          # cluster); and in a different cluster if the gradient_uids
+          # are different (the case that we are in a new invocation of
+          # gradients, taking the gradient of a previously-computed
+          # gradient).
+          self._in_gradient_colocation = op
+          parts = outside_attr.split(".")
+          if len(parts) > 1:
+            uid = parts[-1]
+            if uid == gradient_uid:
+              # Keep using the same cluster
+              cluster = outside_attr
+            else:
+              # We're taking the gradient of a gradient so make a new
+              # cluster attr, adding a new '.uid' on the end to
+              # preserve the invariant that the gradient_uid is the
+              # suffix after the last '.' in the attr.
+              cluster = outside_attr + "." + gradient_uid
+          else:
+            # We're taking the gradient of an Op in the forward pass, so
+            # make a new cluster combining the Op's cluster and the
+            # gradient id.
+            cluster = outside_attr + "." + gradient_uid
+          self._EnterOutsideCompilationScope(cluster=cluster)
+        except ValueError:
+          # The attr was not present: do nothing.
+          pass
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      if not self._gradient_colocation_stack:
+        raise errors.InternalError(
+            op.node_def, op,
+            "Badly nested gradient colocation: empty stack when popping Op " +
+            op.name)
+      last_op = self._gradient_colocation_stack.pop()
+      if op is last_op:
+        if op is self._in_gradient_colocation:
+          self._in_gradient_colocation = None
+          self._ExitOutsideCompilationScope()
+      else:
+        raise errors.InternalError(
+            op.node_def, op, "Badly nested gradient colocation, expected " +
+            last_op + ", got " + op.name)
+
+  def _EnterOutsideCompilationScope(self, cluster=None):
+
+    class FakeOp(object):
+      """A helper class to determine the current device.
+
+      Supports only the device set/get methods needed to run the
+      graph's _apply_device_function method.
+      """
+
+      def __init__(self):
+        self._device = ""
+
+      @property
+      def device(self):
+        return self._device
+
+      def _set_device(self, device):
+        self._device = device.to_string()
+
+    if self._outside_compilation_cluster:
+      raise NotImplementedError("Cannot nest outside_compilation clusters")
+    if cluster:
+      self._outside_compilation_cluster = cluster
+    else:
+      self._outside_compilation_cluster = str(self._outside_compilation_counter)
+      self._outside_compilation_counter += 1
+    graph = ops.get_default_graph()
+    fake_op = FakeOp()
+    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
+    device = pydev.DeviceSpec.from_string(fake_op.device)
+    if (device.device_type == "TPU_REPLICATED_CORE" and
+        device.device_index is not None):
+      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
+                                     str(device.device_index))
+    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
+    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
+
+  def _ExitOutsideCompilationScope(self):
+    if not self._outside_compilation_cluster:
+      raise NotImplementedError(
+          "Attempted to exit outside_compilation scope when not in scope")
+    self._outside_compilation_cluster = None
+    graph = ops.get_default_graph()
+    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
+
+  def Enter(self):
+    if not self._outer_device_function_stack:
+      # Capture the device function stack at the time of first entry
+      # since that is the stack that will be used outside_compilation.
+      graph = ops.get_default_graph()
+      self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
+    super(TPUReplicateContext, self).Enter()
+
+  def Exit(self):
+    super(TPUReplicateContext, self).Exit()
+
+  def HostComputeCore(self):
+    return self._host_compute_core
+
   def AddOp(self, op):
     self._AddOpInternal(op)
 
@@ -157,9 +305,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       raise ValueError("TPU computations cannot be nested")
     op._set_attr(_TPU_REPLICATE_ATTR,
                  attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
-    # pylint: enable=protected-access
-    op.graph.prevent_feeding(op)
-    op.graph.prevent_fetching(op)
+    if self._outside_compilation_cluster:
+      op._set_attr(
+          _OUTSIDE_COMPILATION_ATTR,
+          attr_value_pb2.AttrValue(
+              s=compat.as_bytes(self._outside_compilation_cluster)))
+    if self._num_replicas > 1 or not self._outside_compilation_cluster:
+      # Prevent feeding or fetching anything that is being compiled,
+      # and any replicated outside_compilation Op.
+      op.graph.prevent_feeding(op)
+      op.graph.prevent_fetching(op)
 
   def AddValue(self, val):
     result = val
@@ -181,6 +336,45 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     return None
 
 
+def outside_compilation(computation, args=None):
+  """Builds part of a computation outside any current TPU replicate scope.
+
+  Args:
+    computation: A Python function that builds the computation to
+      place on the host.
+    args: Inputs to pass to computation.
+  Returns:
+    The Tensors returned by computation.
+  """
+  graph = ops.get_default_graph()
+
+  # If we are in a TPUReplicateContext, signal that we are now
+  # outside_compilation
+  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  retval = computation(*args)
+
+  # If we are in a TPUReplicateContext, signal that we are no longer
+  # outside_compilation
+  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  if initial_context is not final_context:
+    raise NotImplementedError(
+        "Control-flow context cannot be different at start and end of an "
+        "outside_compilation scope")
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  return retval
+
+
 def replicate(computation,
               inputs=None,
               infeed_queue=None,
@@ -280,7 +474,8 @@ def replicate(computation,
     computation_inputs.append(
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
-  context = TPUReplicateContext(name=graph.unique_name("cluster"))
+  context = TPUReplicateContext(
+      name=graph.unique_name("cluster"), num_replicas=num_replicas)
   try:
     context.Enter()
 
@@ -361,6 +556,12 @@ def replicate(computation,
   finally:
     context.report_unsupported_operations()
     context.Exit()
+    host_compute_core = context.HostComputeCore()
+
+  if host_compute_core:
+    attr_value = attr_value_pb2.AttrValue()
+    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
+    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
 
   # Fan-out: Builds a TPUReplicatedOutput node for each output.
   outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 6834600b7919ff7c3a2f2e4b01e843b711329bbf..7fab19afeecc258c5185f219da2a11f3ffdad056 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -30,7 +30,6 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.contrib.summary import summary_ops as contrib_summary
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
@@ -38,6 +37,8 @@ from tensorflow.contrib.tpu.python.tpu import tpu_context
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
+from tensorflow.contrib.training.python.training import hparam
+from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
@@ -53,7 +54,9 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as contrib_summary
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -73,6 +76,8 @@ _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
 _ONE_GIGABYTE = 1024 * 1024 * 1024
+_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
+_TPU_TRAIN_OP = '_tpu_train_op'
 
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
@@ -85,6 +90,13 @@ _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 _WRAP_INPUT_FN_INTO_WHILE_LOOP = False
 
 
+ops.register_proto_function(
+    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
+    proto_type=variable_pb2.VariableDef,
+    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
+    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
+
+
 def _create_global_step(graph):
   graph = graph or ops.get_default_graph()
   if training.get_global_step(graph) is not None:
@@ -1297,7 +1309,10 @@ class _ModelFnWrapper(object):
       batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
 
     if batch_size_for_model_fn is not None:
-      params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
+      if isinstance(params, hparam.HParams):
+        params.add_hparam(_BATCH_SIZE_KEY, batch_size_for_model_fn)
+      else:
+        params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
 
     estimator_spec = self._model_fn(features=features, **kwargs)
     if (self._ctx.is_running_on_cpu(is_export_mode) and
@@ -1936,7 +1951,10 @@ class TPUEstimator(estimator_lib.Estimator):
       # input_fn for use_tpu=True/False.
       batch_size_for_input_fn = ctx.batch_size_for_input_fn
       if batch_size_for_input_fn is not None:
-        kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
+        if isinstance(kwargs['params'], hparam.HParams):
+          kwargs['params'].add_hparam(_BATCH_SIZE_KEY, batch_size_for_input_fn)
+        else:
+          kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
 
       # For export_savedmodel, input_fn is never passed to Estimator. So,
       # `is_export_mode` must be False.
@@ -2006,6 +2024,13 @@ class TPUEstimator(estimator_lib.Estimator):
         enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
+        graph = ops.get_default_graph()
+        for enqueue_op in enqueue_ops:
+          if isinstance(enqueue_op, list):
+            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
+          else:
+            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
+
         if mode == model_fn_lib.ModeKeys.TRAIN:
           loss, host_call, scaffold = (
               _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
@@ -2036,11 +2061,14 @@ class TPUEstimator(estimator_lib.Estimator):
           # Validate the TPU training graph to catch basic errors
           _validate_tpu_training_graph()
 
+          train_op = control_flow_ops.group(*update_ops)
+          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
+
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=loss,
               training_hooks=hooks,
-              train_op=control_flow_ops.group(*update_ops),
+              train_op=train_op,
               scaffold=scaffold)
 
         if mode == model_fn_lib.ModeKeys.EVAL:
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index 336d8260c3c8a5c30efa603e3faeabcc0944b8d0..c3882b8a27bc835f906c47dc5219f280c53800b8 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -37,7 +37,7 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    context = tpu.TPUReplicateContext(b"context")
+    context = tpu.TPUReplicateContext(b"context", 1)
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index 4bb53e867811b27dc95857cfdfe936dd2e3b5c6e..f7fd66d33fc0c329db7daaf87373385156d84217 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -138,7 +138,6 @@ from __future__ import print_function
 
 import time
 
-from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
@@ -298,7 +297,7 @@ class SummaryAtEndHook(session_run_hook.SessionRunHook):
   def begin(self):
     if self._replace_summary_op:
       self._summary_op = summary.merge_all()
-    self._global_step = variables.get_or_create_global_step()
+    self._global_step = training_util.get_or_create_global_step()
 
   def after_create_session(self, session, coord):
     if self._summary_writer is None and self._log_dir:
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 95e051e3b5bb9f8075e66891a45c64a27bca68d1..6c59b68053cfc6c1aebfca149bfba583d645a1e7 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -315,7 +315,7 @@ class HParams(object):
 
   Hyperparameters have type, which is inferred from the type of their value
   passed at construction type.   The currently supported types are: integer,
-  float, string, and list of integer, float, or string.
+  float, boolean, string, and list of integer, float, boolean, or string.
 
   You can override hyperparameter values by calling the
   [`parse()`](#HParams.parse) method, passing a string of comma separated
@@ -630,6 +630,9 @@ class HParams(object):
   def __str__(self):
     return str(sorted(self.values().items()))
 
+  def __repr__(self):
+    return '%s(%s)' % (type(self).__name__, self.__str__())
+
   @staticmethod
   def _get_kind_name(param_type, is_list):
     """Returns the field name given parameter type and is_list.
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7d5ae1c5b57bef0d1958eaf13c6b2fdb0cceefc9..3882377d3d9695cc2a2da946ee3d9fd586831529 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -149,6 +149,7 @@ load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
+load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
 
 exports_files(["ops/ops.pbtxt"])
 
@@ -208,6 +209,7 @@ CORE_PROTO_SRCS = [
 # ones with individual proto_library targets.
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
+    "protobuf/checkpointable_object_graph.proto",
     "protobuf/control_flow.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
@@ -244,6 +246,21 @@ tf_nano_proto_library(
     deps = [":protos_all_cc"],
 )
 
+proto_library(
+    name = "example_protos",
+    srcs = [
+        "example/example.proto",
+        "example/feature.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+closure_proto_library(
+    name = "example_protos_closure",
+    visibility = ["//visibility:public"],
+    deps = [":example_protos"],
+)
+
 exports_files([
     "framework/types.proto",
 ])
@@ -256,7 +273,7 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
-# Minimal lib to detect plafrom
+# Minimal lib to detect platform
 cc_library(
     name = "lib_platform",
     hdrs = [
@@ -264,6 +281,55 @@ cc_library(
     ],
 )
 
+PLATFORM_BASE_HDRS = [
+    "platform/logging.h",
+    "platform/macros.h",
+    "platform/types.h",
+    "platform/cpu_info.h",
+]
+
+PLATFORM_OTHER_HDRS = [
+    "platform/abi.h",
+    "platform/stacktrace.h",
+    "platform/stacktrace_handler.h",
+    "platform/context.h",
+    "platform/cpu_feature_guard.h",
+    "platform/dynamic_annotations.h",
+    "platform/env.h",
+    "platform/env_time.h",
+    "platform/file_system.h",
+    "platform/file_system_helper.h",
+    "platform/fingerprint.h",
+    "platform/init_main.h",
+    "platform/mem.h",
+    "platform/mutex.h",
+    "platform/net.h",
+    "platform/notification.h",
+    "platform/null_file_system.h",
+    "platform/prefetch.h",
+    "platform/profile_utils/clock_cycle_profiler.h",
+    "platform/profile_utils/cpu_utils.h",
+    "platform/protobuf.h",
+    "platform/strong_hash.h",
+    "platform/subprocess.h",
+    "platform/thread_annotations.h",
+]
+
+# Smaller platform libraries that don't depend on "lib" or "lib_internal".
+cc_library(
+    name = "platform_base",
+    srcs = glob([
+        "platform/*/integral_types.h",
+        "platform/*/logging.h",
+        "platform/*/cpu_info.h",
+    ]),
+    hdrs = PLATFORM_BASE_HDRS,
+    deps = [
+        ":lib_platform",
+        "//tensorflow/core/platform/default/build_config:base",
+    ],
+)
+
 # Minimal lib so that tools used for mobile compilation
 # don't have to depend on lib/platformlib.
 cc_library(
@@ -294,7 +360,8 @@ cc_library(
 # tf_cc_test and tf_cc_binary will include the necessary symbols.
 cc_library(
     name = "lib",
-    hdrs = [
+    hdrs = PLATFORM_BASE_HDRS +
+           PLATFORM_OTHER_HDRS + [
         "lib/bfloat16/bfloat16.h",
         "lib/core/arena.h",
         "lib/core/bitmap.h",
@@ -341,34 +408,6 @@ cc_library(
         "lib/strings/str_util.h",
         "lib/strings/strcat.h",
         "lib/strings/stringprintf.h",
-        "platform/abi.h",
-        "platform/context.h",
-        "platform/cpu_feature_guard.h",
-        "platform/cpu_info.h",
-        "platform/dynamic_annotations.h",
-        "platform/env.h",
-        "platform/env_time.h",
-        "platform/file_system.h",
-        "platform/file_system_helper.h",
-        "platform/fingerprint.h",
-        "platform/init_main.h",
-        "platform/logging.h",
-        "platform/macros.h",
-        "platform/mem.h",
-        "platform/mutex.h",
-        "platform/net.h",
-        "platform/notification.h",
-        "platform/null_file_system.h",
-        "platform/prefetch.h",
-        "platform/profile_utils/clock_cycle_profiler.h",
-        "platform/profile_utils/cpu_utils.h",
-        "platform/protobuf.h",
-        "platform/stacktrace.h",
-        "platform/strong_hash.h",
-        "platform/subprocess.h",
-        "platform/thread_annotations.h",
-        "platform/types.h",
-        "platform/windows/cpu_info.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -415,6 +454,17 @@ cc_library(
     ],
 )
 
+# Libraries that will eventually be moved into lib/core
+# Note that stringpiece_test can't be place here yet, because we are
+# required to use tf_cc_test, and that rule will change / into _
+cc_library(
+    name = "core_stringpiece",
+    srcs = ["lib/core/stringpiece.cc"],
+    hdrs = ["lib/core/stringpiece.h"],
+    copts = tf_copts(),
+    deps = [":platform_base"],
+)
+
 # Test support library needed for all tests
 # This is currently public, but may be made internal in the
 # future.  Try to avoid depending on it.
@@ -442,6 +492,27 @@ cc_library(
     ] + tf_additional_test_deps(),
 )
 
+# Testing libraries - lite versions that don't depend on all of "lib" or
+# "lib_internal". Instead, they only need a much smaller set of support
+# libraries such as ":platform_base" and ":core_stringpiece".
+cc_library(
+    name = "test_lite",
+    testonly = 1,
+    srcs = [
+        "platform/test.cc",
+    ],
+    hdrs = [
+        "platform/test.h",
+        "platform/test_benchmark.h",
+    ],
+    copts = tf_copts(),
+    deps = [
+        ":lib_platform",
+        ":platform_base",
+        "//tensorflow/core/platform/default/build_config:gtest",
+    ],
+)
+
 # This build rule (along with :framework_internal, :lib, and :lib_internal)
 # purposefully omits the definitions of many declared symbols, which are
 # included in //tensorflow:libtensorflow_framework.so. Using tf_cc_test and tf_cc_binary
@@ -499,7 +570,6 @@ tf_cuda_library(
         "framework/type_index.h",
         "framework/type_traits.h",
         "framework/types.h",
-        "framework/visitable_allocator.h",
         "public/version.h",
         "util/activation_mode.h",
         "util/bcast.h",
@@ -633,10 +703,13 @@ tf_gen_op_libs(
         "boosted_trees_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
+        "collective_ops",
         "control_flow_ops",
         "ctc_ops",
         "data_flow_ops",
         "dataset_ops",
+        "decode_proto_ops",
+        "encode_proto_ops",
         "function_ops",
         "functional_ops",
         "image_ops",
@@ -653,6 +726,7 @@ tf_gen_op_libs(
         "random_ops",
         "remote_fused_graph_ops",
         "resource_variable_ops",
+        "rpc_ops",
         "scoped_allocator_ops",
         "sdca_ops",
         "set_ops",
@@ -746,11 +820,14 @@ cc_library(
         ":boosted_trees_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
+        ":collective_ops_op_lib",
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
         ":cudnn_rnn_ops_op_lib",
         ":data_flow_ops_op_lib",
         ":dataset_ops_op_lib",
+        ":decode_proto_ops_op_lib",
+        ":encode_proto_ops_op_lib",
         ":function_ops_op_lib",
         ":functional_ops_op_lib",
         ":image_ops_op_lib",
@@ -767,6 +844,7 @@ cc_library(
         ":random_ops_op_lib",
         ":remote_fused_graph_ops_op_lib",
         ":resource_variable_ops_op_lib",
+        ":rpc_ops_op_lib",
         ":scoped_allocator_ops_op_lib",
         ":script_ops_op_lib",
         ":sdca_ops_op_lib",
@@ -888,11 +966,14 @@ cc_library(
         "//tensorflow/core/kernels:boosted_trees_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
+        "//tensorflow/core/kernels:collective_ops",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:ctc_ops",
         "//tensorflow/core/kernels:cudnn_rnn_kernels",
         "//tensorflow/core/kernels:data_flow",
         "//tensorflow/core/kernels:dataset_ops",
+        "//tensorflow/core/kernels:decode_proto_op",
+        "//tensorflow/core/kernels:encode_proto_op",
         "//tensorflow/core/kernels:fake_quant_ops",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:functional_ops",
@@ -914,6 +995,7 @@ cc_library(
         "//tensorflow/core/kernels:remote_fused_graph_ops",
         "//tensorflow/core/kernels:required",
         "//tensorflow/core/kernels:resource_variable_ops",
+        "//tensorflow/core/kernels:rpc_op",
         "//tensorflow/core/kernels:scoped_allocator_ops",
         "//tensorflow/core/kernels:sdca_ops",
         "//tensorflow/core/kernels:set_kernels",
@@ -987,6 +1069,7 @@ cc_library(
     hdrs = [
         "common_runtime/function_testlib.h",
         "common_runtime/kernel_benchmark_testlib.h",
+        "common_runtime/test_collective_executor_mgr.h",
         "framework/fake_input.h",
         "framework/function_testlib.h",
         "framework/shape_inference_testutil.h",
@@ -1641,6 +1724,7 @@ cc_library(
         exclude = [
             "**/*test*",
             "framework/variant.cc",
+            "lib/core/stringpiece.cc",
             "lib/hash/crc32c_accelerate.cc",
             "lib/gif/**/*",
             "lib/jpeg/**/*",
@@ -1654,6 +1738,7 @@ cc_library(
     ) + tf_additional_lib_srcs(
         exclude = [
             "**/*test*",
+            "lib/core/stringpiece.cc",
             "platform/**/cuda.h",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/stream_executor.h",
@@ -1674,6 +1759,7 @@ cc_library(
         ":lib_hash_crc32c_accelerate_internal",
         ":lib_proto_parsing",
         ":abi",
+        ":core_stringpiece",
         "//third_party/eigen3",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
@@ -1905,7 +1991,6 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
     "framework/tracking_allocator.h",  # only needed for tests
     "framework/unique_tensor_references.h",
     "framework/variant.h",
-    "framework/visitable_allocator.h",
     "platform/variant_coding.h",
     "util/command_line_flags.h",
     "util/env_var.h",
@@ -2182,18 +2267,19 @@ tf_cuda_library(
 
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
+    "common_runtime/base_collective_executor.h",
     "common_runtime/bfc_allocator.h",
+    "common_runtime/buf_rendezvous.h",
+    "common_runtime/build_graph_options.h",
     "common_runtime/collective_executor_mgr.h",
     "common_runtime/collective_param_resolver_local.h",
     "common_runtime/collective_rma_local.h",
-    "common_runtime/device_resolver_local.h",
-    "common_runtime/buf_rendezvous.h",
-    "common_runtime/build_graph_options.h",
     "common_runtime/constant_folding.h",
     "common_runtime/copy_tensor.h",
     "common_runtime/costmodel_manager.h",
     "common_runtime/debugger_state_interface.h",
     "common_runtime/device_factory.h",
+    "common_runtime/device_resolver_local.h",
     "common_runtime/device_set.h",
     "common_runtime/dma_helper.h",
     "common_runtime/eigen_thread_pool.h",
@@ -2204,18 +2290,21 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
+    "common_runtime/placer.h",
     "common_runtime/process_util.h",
     "common_runtime/profile_handler.h",
     "common_runtime/renamed_device.h",
     "common_runtime/rendezvous_mgr.h",
     "common_runtime/rendezvous_util.h",
+    "common_runtime/ring_reducer.h",
     "common_runtime/scoped_allocator.h",
     "common_runtime/scoped_allocator_mgr.h",
     "common_runtime/session_factory.h",
-    "common_runtime/placer.h",
+    "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
     "common_runtime/step_stats_collector.h",
     "common_runtime/threadpool_device.h",
+    "common_runtime/visitable_allocator.h",
     "graph/gradients.h",
     "graph/quantize_training.h",
 ] + if_mkl(["graph/mkl_graph_util.h"])
@@ -2225,6 +2314,7 @@ tf_cuda_library(
     srcs = [
         "common_runtime/accumulate_n_optimizer.cc",
         "common_runtime/allocator_retry.cc",
+        "common_runtime/base_collective_executor.cc",
         "common_runtime/bfc_allocator.cc",
         "common_runtime/buf_rendezvous.cc",
         "common_runtime/build_graph_options.cc",
@@ -2255,6 +2345,7 @@ tf_cuda_library(
         "common_runtime/renamed_device.cc",
         "common_runtime/rendezvous_mgr.cc",
         "common_runtime/rendezvous_util.cc",
+        "common_runtime/ring_reducer.cc",
         "common_runtime/scoped_allocator.cc",
         "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/session.cc",
@@ -2617,6 +2708,23 @@ cc_library(
     alwayslink = 1,
 )
 
+# This is the lite version of a main() for tests. It does not include any
+# support for reporting benchmark results when running on TPUs.
+cc_library(
+    name = "test_lite_main",
+    testonly = 1,
+    srcs = ["platform/test_main.cc"],
+    copts = tf_copts(),
+    deps = [
+        ":core_stringpiece",
+        ":lib_platform",
+        ":stacktrace_handler",
+        ":test_lite",
+        "//tensorflow/core/platform/default/build_config:test_lite_main",
+    ],
+    alwayslink = 1,
+)
+
 tf_cc_tests(
     name = "low_level_library_tests",
     size = "small",
@@ -3003,6 +3111,34 @@ tf_cc_test(
     ],
 )
 
+tf_cc_tests_gpu(
+    name = "ring_reducer_test",
+    size = "medium",
+    srcs = [
+        "common_runtime/ring_reducer_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_runtime_tests",
     size = "small",
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index b1921e3507bb1a6e8f175305400e4bfbad068d38..62876a293c1656e267e37d8405899e35816f31ec 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -4,7 +4,7 @@ op {
   in_arg {
     name: "node_id_range"
     description: <<END
-A Rank 1 tensor (shape=[2]) to specify the range [first, last] of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1]+1)` (Note that the last index node_id_range[1] is inclusive).
+A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
 END
   }
   in_arg {
@@ -84,4 +84,4 @@ In this manner, the output is the best split per features and per node, so that
 The length of output lists are all of the same length, `num_features`.
 The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 END
-}
+}
\ No newline at end of file
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
index ef45a92498dadb9b911fbb99a1365f81a72060f2..4377125224979a4a499750af19267005c0f19e59 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -31,5 +31,13 @@ END
 The number of layers we attempted to build (but not necessarily succeeded).
 END
   }
-  summary: "Retrieves the tree ensemble resource stamp token."
-}
+  out_arg {
+    name: "last_layer_nodes_range"
+    description: <<END
+Rank size 2 tensor that contains start and end ids of the nodes in the latest
+layer.
+END
+
+  }
+  summary: "Retrieves the tree ensemble resource stamp token, number of trees and growing statistics."
+}
\ No newline at end of file
diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..763760176a722051640d4497e280e11f871f8011
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "BroadcastTo"
+  in_arg {
+    name: "input"
+    description: <<END
+A Tensor to broadcast.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+An 1-D `int` Tensor. The shape of the desired output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor.
+END
+  }
+  summary: "Broadcast an array for a compatible shape."
+  description: <<END
+Broadcasting is the process of making arrays to have compatible shapes
+for arithmetic operations. Two shapes are compatible if for each
+dimension pair they are either equal or one of them is one. When trying
+to broadcast a Tensor to a shape, it starts with the trailing dimensions,
+and works its way forward.
+
+For example,
+```
+>>> x = tf.constant([1, 2, 3])
+>>> y = tf.broadcast_to(x, [3, 3])
+>>> sess.run(y)
+array([[1, 2, 3],
+       [1, 2, 3],
+       [1, 2, 3]], dtype=int32)
+```
+In the above example, the input Tensor with the shape of `[1, 3]`
+is broadcasted to output Tensor with shape of `[3, 3]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..803d8970ab78de347936a8dbbd2f39d8d9915f1e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ClipByValue"
+  in_arg {
+    name: "t"
+    description: <<END
+A `Tensor`.
+END
+  }
+  in_arg {
+    name: "clip_value_min"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The minimum value to clip by.
+END
+  }
+  in_arg {
+    name: "clip_value_max"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The maximum value to clip by.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A clipped `Tensor` with the same shape as input 't'.
+END
+  }
+  summary: "Clips tensor values to a specified min and max."
+  description: <<END
+Given a tensor `t`, this operation returns a tensor of the same type and
+shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+greater than `clip_value_max` are set to `clip_value_max`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..88049bca365f4a738cac9975d0e14340e1ae401d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveBcastRecv"
+  visibility: SKIP
+  summary: "Receives a tensor value broadcast from another device."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ff70f5b178af117e694cf7e998423b5ea58ac5a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveBcastSend"
+  visibility: SKIP
+  summary: "Broadcasts a tensor value to one or more other devices."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10d9771d46d408d9c0414dab4ae5954a75dfc47e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveReduce"
+  visibility: SKIP
+  summary: "Mutually reduces multiple tensors of identical type and shape."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8152f53c4ded035140abd24ba006bf391641cf1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
@@ -0,0 +1,116 @@
+op {
+  graph_op_name: "DecodeProtoV2"
+  in_arg {
+    name: "bytes"
+    description: <<END
+Tensor of serialized protos with shape `batch_shape`.
+END
+  }
+  out_arg {
+    name: "sizes"
+    description: <<END
+Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+Each entry is the number of values found for the corresponding field.
+Optional fields may have 0 or 1 values.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+List of tensors containing values for the corresponding field.
+`values[i]` has datatype `output_types[i]`
+and shape `[batch_shape, max(sizes[...,i])]`.
+END
+  }
+  attr {
+    name: "message_type"
+    description: <<END
+Name of the proto message type to decode.
+END
+  }
+  attr {
+    name: "field_names"
+    description: <<END
+List of strings containing proto field names.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+List of TF types to use for the respective field in field_names.
+END
+  }
+  attr {
+    name: "descriptor_source"
+    description: <<END
+Either the special value `local://` or a path to a file containing
+a serialized `FileDescriptorSet`.
+END
+  }
+  attr {
+    name: "message_format"
+    description: <<END
+Either `binary` or `text`.
+END
+  }
+  attr {
+    name: "sanitize"
+    description: <<END
+Whether to sanitize the result or not.
+END
+  }
+  summary: <<END
+The op extracts fields from a serialized protocol buffers message into tensors.
+END
+  description: <<END
+The `decode_proto` op extracts fields from a serialized protocol buffers
+message into tensors.  The fields in `field_names` are decoded and converted
+to the corresponding `output_types` if possible.
+
+A `message_type` name must be provided to give context for the field
+names. The actual message descriptor can be looked up either in the
+linked-in descriptor pool or a filename provided by the caller using
+the `descriptor_source` attribute.
+
+Each output tensor is a dense tensor. This means that it is padded to
+hold the largest number of repeated elements seen in the input
+minibatch. (The shape is also padded by one to prevent zero-sized
+dimensions). The actual repeat counts for each example in the
+minibatch can be found in the `sizes` output. In many cases the output
+of `decode_proto` is fed immediately into tf.squeeze if missing values
+are not a concern. When using tf.squeeze, always pass the squeeze
+dimension explicitly to avoid surprises.
+
+For the most part, the mapping between Proto field types and
+TensorFlow dtypes is straightforward. However, there are a few
+special cases:
+
+- A proto field that contains a submessage or group can only be converted
+to `DT_STRING` (the serialized submessage). This is to reduce the
+complexity of the API. The resulting string can be used as input
+to another instance of the decode_proto op.
+
+- TensorFlow lacks support for unsigned integers. The ops represent uint64
+types as a `DT_INT64` with the same twos-complement bit pattern
+(the obvious way). Unsigned int32 values can be represented exactly by
+specifying type `DT_INT64`, or using twos-complement if the caller
+specifies `DT_INT32` in the `output_types` attribute.
+
+The `descriptor_source` attribute selects a source of protocol
+descriptors to consult when looking up `message_type`. This may be a
+filename containing a serialized `FileDescriptorSet` message,
+or the special value `local://`, in which case only descriptors linked
+into the code will be searched; the filename can be on any filesystem
+accessible to TensorFlow.
+
+You can build a `descriptor_source` file using the `--descriptor_set_out`
+and `--include_imports` options to the protocol compiler `protoc`.
+
+The `local://` database only covers descriptors linked into the
+code via C++ libraries, not Python imports. You can link in a proto descriptor
+by creating a cc_library target with alwayslink=1.
+
+Both binary and text proto serializations are supported, and can be
+chosen using the `format` attribute.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe0fc3823ff724641298c03f74c115dd6211f385
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "DeepCopy"
+  in_arg {
+    name: "x"
+    description: "The source tensor of type `T`."
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+    y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
+      is not an alias of `x`.
+END
+  }
+  summary: "Makes a copy of `x`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..746f561e9251c34c327123efd349bfb57682d7aa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "Empty"
+  in_arg {
+    name: "shape"
+    description: "1-D. Represents the shape of the output tensor."
+  }
+  attr {
+    name: "init"
+    description:
+        "If True, initialize the returned tensor with the default value "
+        "of dtype.  Otherwise, the implementation is free not to initialize"
+        "the tensor's content."
+  }
+  out_arg {
+    name: "output"
+    description: "A `Tensor` of type `T`."
+  }
+  summary: <<END
+Creates a tensor with the given shape.
+
+This operation creates a tensor of `shape` and `dtype`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt b/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fdbe47f23615a1bbac30346d46241c4d321bc649
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
@@ -0,0 +1,81 @@
+op {
+  graph_op_name: "EncodeProto"
+  in_arg {
+    name: "sizes"
+    description: <<END
+Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+List of tensors containing values for the corresponding field.
+END
+  }
+  out_arg {
+    name: "bytes"
+    description: <<END
+Tensor of serialized protos with shape `batch_shape`.
+END
+  }
+  attr {
+    name: "message_type"
+    description: <<END
+Name of the proto message type to decode.
+END
+  }
+  attr {
+    name: "field_names"
+    description: <<END
+List of strings containing proto field names.
+END
+  }
+  attr {
+    name: "Tinput_types"
+    description: <<END
+The input types.
+END
+  }
+  summary: <<END
+The op serializes protobuf messages provided in the input tensors.
+END
+  description: <<END
+The types of the tensors in `values` must match the schema for the
+fields specified in `field_names`. All the tensors in `values` must
+have a common shape prefix, *batch_shape*.
+
+The `sizes` tensor specifies repeat counts for each field.  The repeat
+count (last dimension) of a each tensor in `values` must be greater
+than or equal to corresponding repeat count in `sizes`.
+
+A `message_type` name must be provided to give context for the field
+names. The actual message descriptor can be looked up either in the
+linked-in descriptor pool or a filename provided by the caller using
+the `descriptor_source` attribute.
+
+The `descriptor_source` attribute selects a source of protocol
+descriptors to consult when looking up `message_type`. This may be a
+filename containing a serialized `FileDescriptorSet` message,
+or the special value `local://`, in which case only descriptors linked
+into the code will be searched; the filename can be on any filesystem
+accessible to TensorFlow.
+
+You can build a `descriptor_source` file using the `--descriptor_set_out`
+and `--include_imports` options to the protocol compiler `protoc`.
+
+The `local://` database only covers descriptors linked into the
+code via C++ libraries, not Python imports. You can link in a proto descriptor
+by creating a cc_library target with alwayslink=1.
+
+There are a few special cases in the value mapping:
+
+Submessage and group fields must be pre-serialized as TensorFlow strings.
+
+TensorFlow lacks support for unsigned int64s, so they must be
+represented as `tf.int64` with the same twos-complement bit pattern
+(the obvious way).
+
+Unsigned int32 values can be represented exactly with `tf.int64`, or
+with sign wrapping if the input is of type `tf.int32`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3654286cc350995f8bed497cd662fce3b4150872
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceAdd"
+  in_arg {
+    name: "x"
+    description: "A `Tensor` of type T."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Adds v into specified rows of x.
+
+    Computes y = x; y[i, :] += v; return y.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9480b4a3837893395168785c5b5b9ba74b643d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceSub"
+  in_arg {
+    name: "x"
+    description: "A `Tensor` of type T."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Subtracts `v` into specified rows of `x`.
+
+    Computes y = x; y[i, :] -= v; return y.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fcd3659dc771077d34d2ba833a40e7d6be68f53
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceUpdate"
+  in_arg {
+    name: "x"
+    description: "A tensor of type `T`."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Updates specified rows with values in `v`.
+
+    Computes `x[i, :] = v; return x`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt b/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..344ef191fd580657acd5ebf75c3b5969f1af1fd2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
@@ -0,0 +1,108 @@
+op {
+  graph_op_name: "Rpc"
+  in_arg {
+    name: "address"
+    description: <<END
+`0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `method` and `request`.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+`0-D` or `1-D`.  The method address on the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `request`.
+END
+  }
+  in_arg {
+    name: "request"
+    description: <<END
+`0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `method`.
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+Same shape as `request`. Serialized proto strings: the rpc responses.
+END
+  }
+  attr {
+    name: "protocol"
+    description: <<END
+RPC protocol to use.  Empty string means use the default protocol.
+Options include 'grpc'.
+END
+  }
+  attr {
+    name: "fail_fast"
+    description: <<END
+`boolean`. If `true` (default), then failures to connect
+(i.e., the server does not immediately respond) cause an RPC failure.
+END
+  }
+  attr {
+    name: "timeout_in_ms"
+    description: <<END
+`int`. If `0` (default), then the kernel will run the RPC
+request and only time out if the RPC deadline passes or the session times out.
+If this value is greater than `0`, then the op will raise an exception if
+the RPC takes longer than `timeout_in_ms`.
+END
+  }
+  summary: <<END
+Perform batches of RPC requests.
+END
+  description: <<END
+This op asynchronously performs either a single RPC request, or a batch
+of requests.  RPC requests are defined by three main parameters:
+
+  - `address` (the host+port or BNS address of the request)
+  - `method` (the RPC method name for the request)
+  - `request` (the serialized proto string, or vector of strings,
+     of the RPC request argument).
+
+For example, if you have an RPC service running on port localhost:2345,
+and its interface is configured with the following proto declaration:
+
+```
+service MyService {
+  rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+  }
+};
+```
+
+then call this op with arguments:
+
+```
+address = "localhost:2345"
+method = "MyService/MyMethod"
+```
+
+The `request` tensor is a string tensor representing serialized `MyRequestProto`
+strings; and the output string tensor `response` will have the same shape
+and contain (upon successful completion) corresponding serialized
+`MyResponseProto` strings.
+
+For example, to send a single, empty, `MyRequestProto`, call
+this op with `request = ""`.  To send 5 **parallel** empty requests,
+call this op with `request = ["", "", "", "", ""]`.
+
+More generally, one can create a batch of `MyRequestProto` serialized protos
+from regular batched tensors using the `encode_proto` op, and convert
+the response `MyResponseProto` serialized protos to batched tensors
+using the `decode_proto` op.
+
+**NOTE** Working with serialized proto strings is faster than instantiating
+actual proto objects in memory, so no performance degradation is expected
+compared to writing custom kernels for this workflow.
+
+If the connection fails or the remote worker returns an error
+status, the op reraises this exception locally.
+
+See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4e6c1fddd6202be4380fa7994ac1e4e60338217
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessMultinomial.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "StatelessMultinomial"
+  in_arg {
+    name: "logits"
+    description: <<END
+2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+represents the unnormalized log probabilities for all classes.
+END
+  }
+  in_arg {
+    name: "num_samples"
+    description: <<END
+0-D.  Number of independent samples to draw for each row slice.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+2 seeds (shape [2]).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+contains the drawn class labels with range `[0, num_classes)`.
+END
+  }
+  summary: "Draws samples from a multinomial distribution."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c47208fa0525ccc7f91711bade66b0c86b914a7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListGetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListGetItem"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f33d4926018f6ebad79c7e2e69fca9a1966eb5f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListPushBackBatch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPushBackBatch"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..002e2a9bd37c2e6a2b41ba43237278bc42119bf7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListSetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListSetItem"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt b/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bded00e83c7de09da8eb06d353925a83bb4e7134
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
@@ -0,0 +1,123 @@
+op {
+  graph_op_name: "TryRpc"
+  in_arg {
+    name: "address"
+    description: <<END
+`0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `method` and `request`.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+`0-D` or `1-D`.  The method address on the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `request`.
+END
+  }
+  in_arg {
+    name: "request"
+    description: <<END
+`0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `method`.
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+Same shape as `request`. Serialized proto strings: the rpc responses.
+END
+  }
+  out_arg {
+    name: "status_code"
+    description: <<END
+Same shape as `request`.  Values correspond to tensorflow Status enum codes.
+END
+  }
+  out_arg {
+    name: "status_message"
+    description: <<END
+Same shape as `request`.  Values correspond to Status messages
+returned from the RPC calls.
+END
+  }
+  attr {
+    name: "protocol"
+    description: <<END
+RPC protocol to use.  Empty string means use the default protocol.
+Options include 'grpc'.
+END
+  }
+  attr {
+    name: "fail_fast"
+    description: <<END
+`boolean`. If `true` (default), then failures to connect
+(i.e., the server does not immediately respond) cause an RPC failure.
+END
+  }
+  attr {
+    name: "timeout_in_ms"
+    description: <<END
+`int`. If `0` (default), then the kernel will run the RPC
+request and only time out if the RPC deadline passes or the session times out.
+If this value is greater than `0`, then the op will raise an exception if
+the RPC takes longer than `timeout_in_ms`.
+END
+  }
+  summary: <<END
+Perform batches of RPC requests.
+END
+  description: <<END
+This op asynchronously performs either a single RPC request, or a batch
+of requests.  RPC requests are defined by three main parameters:
+
+  - `address` (the host+port or BNS address of the request)
+  - `method` (the method name for the request)
+  - `request` (the serialized proto string, or vector of strings,
+     of the RPC request argument).
+
+For example, if you have an RPC service running on port localhost:2345,
+and its interface is configured with the following proto declaration:
+
+```
+service MyService {
+  rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+  }
+};
+```
+
+then call this op with arguments:
+
+```
+address = "localhost:2345"
+method = "MyService/MyMethod"
+```
+
+The `request` tensor is a string tensor representing serialized `MyRequestProto`
+strings; and the output string tensor `response` will have the same shape
+and contain (upon successful completion) corresponding serialized
+`MyResponseProto` strings.
+
+For example, to send a single, empty, `MyRequestProto`, call
+this op with `request = ""`.  To send 5 **parallel** empty requests,
+call this op with `request = ["", "", "", "", ""]`.
+
+More generally, one can create a batch of `MyRequestProto` serialized protos
+from regular batched tensors using the `encode_proto` op, and convert
+the response `MyResponseProto` serialized protos to batched tensors
+using the `decode_proto` op.
+
+**NOTE** Working with serialized proto strings is faster than instantiating
+actual proto objects in memory, so no performance degradation is expected
+compared to writing custom kernels for this workflow.
+
+Unlike the standard `Rpc` op, if the connection fails or the remote worker
+returns an error status, this op does **not** reraise the exception.
+Instead, the `status_code` and `status_message` entry for the corresponding RPC
+call is set with the error returned from the RPC call.  The `response` tensor
+will contain valid response values for those minibatch entries whose RPCs did
+not fail; the rest of the entries will have empty strings.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..083eeced81dfc04d01c8721e3fb65727ef13176a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BroadcastTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cacdd5c2ca0838701aff1c085f06d81319612832
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ClipByValue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt
deleted file mode 100644
index f0b7539918617e866acdf4d4d88279e1aeeb7a14..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "DecodeCompressed"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2d5ed2b432d8ac5e60414409311308dcce7a486d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeepCopy"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b863520e987b69df680c84efcbdfca44518c6e8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Empty"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..390e3bbf97340472608414af23ad5e6d8ee300ae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af9634f9b2b0cfe4e050005b8b05ca127d0523d9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5fa9d778ea6c937e7b8502b6db32d15bfa2ca90d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..867116c5da718f66205132d70a93c39464096df6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SlideDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f6332fabdb3b1277884bbcadd67510869fb9dfd7
--- /dev/null
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -0,0 +1,257 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+#define VALUE_IN_DEBUG_STRING false
+
+namespace tensorflow {
+/*static*/
+int64 CollectiveAdapter::AlignedChunkElts(int64 elt_bytes, int64 total_elts,
+                                          int64 num_chunks) {
+  DCHECK_GT(num_chunks, 0);
+  int64 base_chunk_elts = (total_elts + (num_chunks - 1)) / num_chunks;
+  if (EIGEN_MAX_ALIGN_BYTES == 0) return base_chunk_elts;
+  if (EIGEN_MAX_ALIGN_BYTES <= elt_bytes) {
+    // Tolerate weird small values of EIGEN_MAX_ALIGN_BYTES
+    DCHECK_EQ(0, elt_bytes % EIGEN_MAX_ALIGN_BYTES);
+    return base_chunk_elts;
+  }
+  // elt_bytes < EIGEN_MAX_ALIGN_BYTES, which
+  // must be a common multiple of the various atomic data types.
+  DCHECK_EQ(0, EIGEN_MAX_ALIGN_BYTES % elt_bytes)
+      << "total_elts=" << total_elts << " num_chunks=" << num_chunks
+      << " EIGEN_MAX_ALIGN_BYTES=" << EIGEN_MAX_ALIGN_BYTES
+      << " elt_bytes=" << elt_bytes;
+  // Round bytes per chunk up to the next multiple of EIGEN_MAX_ALIGN_BYTES.
+  int64 chunk_bytes = base_chunk_elts * elt_bytes;
+  int64 diff =
+      (chunk_bytes < EIGEN_MAX_ALIGN_BYTES)
+          ? (EIGEN_MAX_ALIGN_BYTES - chunk_bytes)
+          : (EIGEN_MAX_ALIGN_BYTES - (chunk_bytes % EIGEN_MAX_ALIGN_BYTES));
+  CHECK_EQ(0, diff % elt_bytes);
+  base_chunk_elts += (diff / elt_bytes);
+  DCHECK_EQ(0, ((base_chunk_elts * elt_bytes) % EIGEN_MAX_ALIGN_BYTES))
+      << "total_elts=" << total_elts << " num_chunks=" << num_chunks
+      << " EIGEN_MAX_ALIGN_BYTES=" << EIGEN_MAX_ALIGN_BYTES
+      << " base_chunk_elts=" << base_chunk_elts << " elt_bytes=" << elt_bytes;
+  return base_chunk_elts;
+}
+
+namespace {
+template <typename T>
+class CollectiveAdapterImpl : public CollectiveAdapter {
+ public:
+  // Takes ownership of output and prepares to properly alias its chunks.
+  // Ownership is taken because the shape may temporarily change.
+  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator)
+      : output_(std::move(*output)),
+        dt_(output_.dtype()),
+        old_shape_(output_.shape()),
+        num_chunks_(num_chunks),
+        allocator_(allocator),
+        total_elts_(output_.NumElements()),
+        chunk_elts_(AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)),
+        data_start_(reinterpret_cast<T*>(DMAHelper::base(&output_))),
+        data_end_(data_start_ + total_elts_) {
+    CHECK_GT(chunk_elts_, 0);
+    Flatten();
+  }
+
+  ~CollectiveAdapterImpl() override {}
+
+  const Tensor& Value() const override { return output_; }
+
+  // If necessary, flatten output.
+  void Flatten() {
+    if (old_shape_.dims() > 1) {
+      TensorShape new_shape = TensorShape({old_shape_.num_elements()});
+      DMAHelper::UnsafeSetShape(&output_, new_shape);
+    }
+  }
+
+  void ConsumeFinalValue(Tensor* output) override {
+    if (old_shape_ != output_.shape()) {
+      DMAHelper::UnsafeSetShape(&output_, old_shape_);
+    }
+    *output = std::move(output_);
+  }
+
+  // Number of T elements in a particular chunk.
+  inline int64 ChunkElts(int i) const {
+    DCHECK_LT(i, num_chunks_);
+    const T* chunk_start = std::min(data_end_, data_start_ + i * chunk_elts_);
+    const T* chunk_end = std::min(data_end_, chunk_start + chunk_elts_);
+    return chunk_end - chunk_start;
+  }
+
+  int64 ChunkBytes(int i) const override { return sizeof(T) * ChunkElts(i); }
+
+  // Returns a new Tensor that aliases the required chunk.
+  Tensor ChunkAlias(int i) override {
+    int64 start = chunk_elts_ * i;
+    int64 num_elts = ChunkElts(i);
+    // If this chunk is empty the prior chunk might also be short
+    // so always take an empty slice from the front of the tensor
+    // to avoid an illegal offset check failure somewhere.
+    return (num_elts > 0) ? output_.Slice(start, start + num_elts)
+                          : output_.Slice(0, 0);
+  }
+
+  Tensor TempChunk(int i) const override {
+    AllocationAttributes empty;
+    return Tensor(allocator_, dt_, {ChunkElts(i)}, empty);
+  }
+
+  string DebugString() const override {
+    return strings::StrCat(
+        "base addr ", reinterpret_cast<int64>(DMAHelper::base(&output_)),
+        " num_chunks ", num_chunks_, " total_elts ", total_elts_, " chunk_elts",
+        chunk_elts_, " value ",
+        VALUE_IN_DEBUG_STRING ? output_.SummarizeValue(1024) : "<hidden>");
+  }
+
+  string TBounds(const Tensor& t) const override {
+    int64 base_addr = reinterpret_cast<int64>(DMAHelper::base(&t));
+    return strings::StrCat("(", base_addr, ", ", (base_addr + t.TotalBytes()),
+                           ")");
+  }
+
+  Tensor Scalar(int v) const override {
+    Tensor t(dt_, TensorShape({}));
+    t.scalar<T>()() = v;
+    return t;
+  }
+
+  Tensor Scalar(Allocator* a) const override {
+    Tensor t(a, dt_, TensorShape({}));
+    return t;
+  }
+
+  Tensor output_;
+  const DataType dt_;
+  const TensorShape old_shape_;
+  const int64 num_chunks_;
+  Allocator* allocator_;
+  const int64 total_elts_;
+  const int64 chunk_elts_;
+  const T* data_start_;
+  const T* data_end_;
+};
+
+}  // namespace
+
+CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
+                                         Allocator* allocator) {
+  switch (output->dtype()) {
+    case DT_FLOAT:
+      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator);
+      break;
+    case DT_DOUBLE:
+      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator);
+      break;
+    case DT_INT32:
+      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator);
+      break;
+    case DT_INT64:
+      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported type " << output->dtype()
+                 << " to MakeCollectiveAdapter";
+      return nullptr;
+  }
+}
+
+BaseCollectiveExecutor::~BaseCollectiveExecutor() {}
+
+void BaseCollectiveExecutor::StartAbort(const Status& s) {
+  LOG(WARNING) << "BaseCollectiveExecutor::StartAbort " << s;
+  remote_access_->StartAbort(s);
+}
+
+void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
+                                          const CollectiveParams& col_params,
+                                          const string& exec_key,
+                                          StatusCallback done) {
+  const Tensor* input = &ctx->input(0);
+  Tensor* output = ctx->mutable_output(0);
+  string error;
+  switch (col_params.instance.type) {
+    case REDUCTION_COLLECTIVE: {
+      // TODO(tucker): support other reduction algorithms,
+      // e.g. tree-reduce, hybrid tree/ring, delegate-to-NCCL, etc.
+      RingReducer* reducer =
+          CreateReducer(ctx, CtxParams(ctx), col_params, exec_key, step_id_,
+                        input, output, &error);
+      if (!reducer) {
+        done(errors::Internal(error));
+        return;
+      }
+      // Run in an I/O thread, so as not to starve the executor threads.
+      // TODO(tucker): Instead of forking every per-device Collective
+      // Op off into its own thread, consider queuing them on a
+      // fixed-size thread-pool dedicated to running CollectiveOps.
+      SchedClosure([reducer, done]() {
+        reducer->Run([reducer, done](const Status& s) {
+          done(s);
+          delete reducer;
+        });
+      });
+    } break;
+    case BROADCAST_COLLECTIVE:
+      done(errors::Internal("Collective Broadcast unimplemented"));
+      break;
+    default:
+      done(errors::Internal("Unimplemented CollectiveType ",
+                            col_params.instance.type));
+  }
+}
+
+RingReducer* BaseCollectiveExecutor::CreateReducer(
+    OpKernelContext* ctx, OpKernelContext::Params* params,
+    const CollectiveParams& col_params, const string& exec_key, int64 step_id,
+    const Tensor* input, Tensor* output, string* error) {
+  switch (col_params.instance.data_type) {
+    case DT_INT32:
+      if (col_params.group.device_type == DEVICE_GPU) {
+        *error =
+            "Collective Reduce does not support datatype DT_INT32 on "
+            "DEVICE_GPU";
+        return nullptr;
+      }
+      TF_FALLTHROUGH_INTENDED;
+    case DT_FLOAT:
+    case DT_DOUBLE:
+    case DT_INT64:
+      return new RingReducer(this, dev_mgr_, ctx, params, col_params, exec_key,
+                             step_id, input, output);
+      break;
+    default:
+      *error = strings::StrCat("Collective Reduce does not support datatype ",
+                               col_params.instance.data_type);
+      return nullptr;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..58eaf31f71042b3105828bda745e96585fad52b9
--- /dev/null
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
+
+#include <string>
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class DeviceMgr;
+class RingReducer;
+
+// Helper interface that aliases regular subfields of a Tensor as separate
+// Tensors for in-place update.
+class CollectiveAdapter {
+ public:
+  virtual ~CollectiveAdapter() {}
+
+  // Move the backing tensor to 'output' with its original storage and
+  // shape. After this call this CollectiveAdapter object should be
+  // deleted immediately without calling any of its other methods.
+  virtual void ConsumeFinalValue(Tensor* output) = 0;
+
+  // const access to entire intermediate value for debugging
+  virtual const Tensor& Value() const = 0;
+
+  // Returns tensor for chunk i which aliases the backing buffer.
+  virtual Tensor ChunkAlias(int i) = 0;
+
+  // Returns tensor allocated on the same device but with its own
+  // separate backing buffer.  Will have same type and size as
+  // chunk i.
+  virtual Tensor TempChunk(int i) const = 0;
+
+  // Bytes in chunk i
+  virtual int64 ChunkBytes(int i) const = 0;
+
+  // Generate a CPU RAM scalar tensor of the same DataType as the
+  // backing tensor with the given integer value.
+  virtual Tensor Scalar(int v) const = 0;
+
+  // Generate a scalar tensor of same DataType and on the same device
+  // as the backing tensor.
+  virtual Tensor Scalar(Allocator* a) const = 0;
+
+  // Debugging string describing buffer location
+  virtual string TBounds(const Tensor& t) const = 0;
+
+  virtual string DebugString() const = 0;
+
+  // Computes the number of elements per alias chunk tensor.
+  //
+  // A CHECK in tensor.cc expects that the memory buffer backing a
+  // Tensor will be aligned according to EIGEN_MAX_ALIGN_BYTES.  To
+  // ensure that all chunk aliasing Tensors maintain this alignment we
+  // need to pick a chunk size that preserves it.  Note than in extreme
+  // cases (impractical, but possible with very small tensors) one or
+  // more tail chunks can end up emptby.
+  static int64 AlignedChunkElts(int64 elt_bytes, int64 total_elts,
+                                int64 num_chunks);
+};
+
+// Create a CollectiveAdaptor wrapping 'output', specialized to its
+// data-type and shape.
+CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
+                                         Allocator* allocator);
+
+// Default implementation of CollectiveExecutor.  Delegates the actual
+// work of moving data to a class specialized for the operation type,
+// arguments and device+interconnect topology.
+class BaseCollectiveExecutor : public CollectiveExecutor {
+ public:
+  BaseCollectiveExecutor(CollectiveExecutorMgrInterface* cem,
+                         PerStepCollectiveRemoteAccess* remote_access,
+                         int64 step_id, const DeviceMgr* dev_mgr)
+      : CollectiveExecutor(cem),
+        step_id_(step_id),
+        dev_mgr_(dev_mgr),
+        remote_access_(remote_access) {}
+
+  ~BaseCollectiveExecutor() override;
+
+  void StartAbort(const Status& s) override;
+
+  void ExecuteAsync(OpKernelContext* ctx, const CollectiveParams& col_params,
+                    const string& exec_key, StatusCallback done) override;
+
+  PerStepCollectiveRemoteAccess* remote_access() override {
+    return remote_access_.get();
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    remote_access_->RecvFromPeer(peer_device, peer_task, peer_is_local, key,
+                                 to_device, to_device_ctx, to_alloc_attr,
+                                 to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    remote_access_->PostToPeer(peer_device, peer_task, key, from_device,
+                               from_device_ctx, from_alloc_attr, from_tensor,
+                               client_locality, done);
+  }
+
+ protected:
+  const int64 step_id_;
+  const DeviceMgr* dev_mgr_;  // Not owned.
+  std::unique_ptr<PerStepCollectiveRemoteAccess> remote_access_;
+
+ private:
+  RingReducer* CreateReducer(OpKernelContext* ctx,
+                             OpKernelContext::Params* params,
+                             const CollectiveParams& col_params,
+                             const string& exec_key, int64 step_id,
+                             const Tensor* input, Tensor* output,
+                             string* error);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BASE_COLLECTIVE_EXECUTOR_H_
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index e34945dd48a1e54e4ae82dd7ea9959f39a97f2c2..b8e773503c7a2f8024e8a6f58247ad343a762f71 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.cc b/tensorflow/core/common_runtime/collective_executor_mgr.cc
index a5c4946e58edf900ef3e42586f6b484d8f5e4891..e07829b286741e18db21e3c491973ec8f4b973dc 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/collective_executor_mgr.h"
 
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
 #include "tensorflow/core/common_runtime/build_graph_options.h"
 #include "tensorflow/core/common_runtime/collective_rma_local.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -21,39 +22,6 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
-namespace {
-// TODO(tucker): Temporary class just until a real CollectiveExecutor
-// implementation is submitted in a later CL.
-class DummyCollectiveExecutor : public CollectiveExecutor {
- public:
-  explicit DummyCollectiveExecutor(CollectiveExecutorMgr* ce_mgr)
-      : CollectiveExecutor(ce_mgr) {}
-
-  ~DummyCollectiveExecutor() override {}
-
-  void RecvFromPeer(const string& peer_device, const string& peer_task,
-                    bool peer_is_local, const string& key, Device* to_device,
-                    DeviceContext* to_device_ctx,
-                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
-                    const DeviceLocality& client_locality,
-                    const StatusCallback& done) override {
-    done(errors::Internal("Unimplemented"));
-  }
-
-  void PostToPeer(const string& peer_device, const string& peer_task,
-                  const string& key, Device* from_device,
-                  DeviceContext* from_device_ctx,
-                  const AllocatorAttributes& from_alloc_attr,
-                  const Tensor* from_tensor,
-                  const DeviceLocality& client_locality,
-                  const StatusCallback& done) override {
-    done(errors::Internal("Unimplemented"));
-  }
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(DummyCollectiveExecutor);
-};
-}  // namespace
 
 CollectiveExecutorMgr::CollectiveExecutorMgr(
     const ConfigProto& config, const DeviceMgr* dev_mgr,
@@ -77,7 +45,9 @@ CollectiveExecutor* CollectiveExecutorMgr::FindOrCreate(int64 step_id) {
     if (it != executor_table_.end()) {
       ce = it->second;
     } else {
-      ce = new DummyCollectiveExecutor(this);
+      CollectiveRemoteAccessLocal* rma = new CollectiveRemoteAccessLocal(
+          dev_mgr_, dev_resolver_.get(), step_id);
+      ce = new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_);
       executor_table_[step_id] = ce;
     }
     ce->Ref();
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index f95cecfc66785b9aa89d95bdbb916c36ef167f71..8ddc9958b2259f4da6dc1750c6c79a706c804be8 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -81,6 +81,7 @@ class DirectSessionMinusAXTest : public ::testing::Test {
     test::FillValues<float>(&a_tensor, a_values);
     Node* a = test::graph::Constant(&graph, a_tensor);
     a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
+    a_ = a->name();
 
     Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
     test::FillValues<float>(&x_tensor, {1, 1});
@@ -97,12 +98,18 @@ class DirectSessionMinusAXTest : public ::testing::Test {
     y_neg_ = y_neg->name();
     y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
 
+    Node* z = test::graph::Unary(&graph, "Identity", y_neg);
+    z_ = z->name();
+    z->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
+
     test::graph::ToGraphDef(&graph, &def_);
   }
 
+  string a_;
   string x_;
   string y_;
   string y_neg_;
+  string z_;
   GraphDef def_;
 };
 
@@ -133,7 +140,6 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
   TF_ASSERT_OK(session->Create(def_));
-  std::vector<std::pair<string, Tensor>> inputs;
 
   // Run the test twice to ensure that the Make/Run/Release cycle is hermetic.
   for (int i = 0; i < 2; ++i) {
@@ -175,6 +181,159 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
   }
 }
 
+TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+
+  {
+    // Directly wire the output of node a to the output of node y, making the
+    // callable graph into "Neg(a);".
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+    ASSERT_EQ(1, outputs.size());
+    auto mat = outputs[0].matrix<float>();
+    ASSERT_TRUE(outputs[0].IsInitialized());
+    EXPECT_FLOAT_EQ(-3.0, mat(0, 0));
+    EXPECT_FLOAT_EQ(-2.0, mat(0, 1));
+    EXPECT_FLOAT_EQ(1.0, mat(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat(1, 1));
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+  }
+
+  {
+    // Directly wire the output of node a to the output of node y, making the
+    // callable graph into "Neg(a);"; also fetch the result of a.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(a_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+    ASSERT_EQ(2, outputs.size());
+    auto mat_a = outputs[0].matrix<float>();
+    ASSERT_TRUE(outputs[0].IsInitialized());
+    EXPECT_FLOAT_EQ(3.0, mat_a(0, 0));
+    EXPECT_FLOAT_EQ(2.0, mat_a(0, 1));
+    EXPECT_FLOAT_EQ(-1.0, mat_a(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat_a(1, 1));
+
+    auto mat_y_neg = outputs[1].matrix<float>();
+    ASSERT_TRUE(outputs[1].IsInitialized());
+    EXPECT_FLOAT_EQ(-3.0, mat_y_neg(0, 0));
+    EXPECT_FLOAT_EQ(-2.0, mat_y_neg(0, 1));
+    EXPECT_FLOAT_EQ(1.0, mat_y_neg(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat_y_neg(1, 1));
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+  }
+
+  {
+    // Wire the output of "Neg(Matmul(a, x))" to the output of "a",
+    // creating an invalid cycle.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(y_ + ":0");
+    c->set_to_tensor(a_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(
+        str_util::StrContains(s.error_message(), "would create a cycle"));
+  }
+
+  {
+    // Attempt to wire a non-existent node to a node that does exist.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor("unknown_node:0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "unknown node"));
+  }
+
+  {
+    // Attempt to wire a non-existent output from a node that does
+    // exist to another node.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":17");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "unknown edge"));
+  }
+
+  {
+    // Attempt to wire a tensor to a node that doesn't exist.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor("unknown_node:0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsNotFound(s));
+    EXPECT_TRUE(
+        str_util::StrContains(s.error_message(), "unable to find feed output"));
+  }
+
+  {
+    // Attempt to wire two tensors to the same tensor.
+    CallableOptions callable_options;
+    TensorConnection* c1 = callable_options.add_tensor_connection();
+    c1->set_from_tensor(a_ + ":0");
+    c1->set_to_tensor(y_neg_ + ":0");
+    TensorConnection* c2 = callable_options.add_tensor_connection();
+    c2->set_from_tensor(x_ + ":0");
+    c2->set_to_tensor(y_neg_ + ":0");
+    callable_options.add_fetch(z_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+  }
+
+  {
+    // Attempt to wire a tensor to a tensor that is also being fed.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_feed(y_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+  }
+}
+
 TEST_F(DirectSessionMinusAXTest, TestFeed) {
   Initialize({1, 2, 3, 4});
   auto session = CreateSession();
@@ -654,6 +813,55 @@ TEST(DirectSessionTest, MultipleFeedTest_Callable) {
   EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
 }
 
+TEST(DirectSessionTest, TestTensorConnectionUseTwice) {
+  Graph graph(OpRegistry::Global());
+
+  Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a_tensor, {1.0, 2.0, 3.0, 4.0});
+  Node* a = test::graph::Constant(&graph, a_tensor);
+
+  Tensor dummy_tensor(DT_FLOAT, TensorShape({1}));
+  test::FillValues<float>(&dummy_tensor, {-1.0});
+
+  Node* left = test::graph::Constant(&graph, dummy_tensor);
+  Node* right = test::graph::Constant(&graph, dummy_tensor);
+
+  // y = A * x
+  Node* y = test::graph::Add(&graph, left, right);
+
+  GraphDef def;
+  test::graph::ToGraphDef(&graph, &def);
+
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def));
+
+  CallableOptions callable_options;
+  // Directly wire the output of node a to the outputs of nodes left
+  // and right, making the callable graph into "a + a;".
+  TensorConnection* c_left = callable_options.add_tensor_connection();
+  c_left->set_from_tensor(a->name() + ":0");
+  c_left->set_to_tensor(left->name() + ":0");
+  TensorConnection* c_right = callable_options.add_tensor_connection();
+  c_right->set_from_tensor(a->name() + ":0");
+  c_right->set_to_tensor(right->name() + ":0");
+
+  callable_options.add_fetch(y->name() + ":0");
+
+  Session::CallableHandle handle;
+  TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+  ASSERT_EQ(1, outputs.size());
+  auto mat = outputs[0].matrix<float>();
+  ASSERT_TRUE(outputs[0].IsInitialized());
+  EXPECT_FLOAT_EQ(2.0, mat(0, 0));
+  EXPECT_FLOAT_EQ(4.0, mat(0, 1));
+  EXPECT_FLOAT_EQ(6.0, mat(1, 0));
+  EXPECT_FLOAT_EQ(8.0, mat(1, 1));
+  TF_ASSERT_OK(session->ReleaseCallable(handle));
+}
+
 TEST(DirectSessionTest, FetchMultipleTimes) {
   Graph g(OpRegistry::Global());
   Tensor seven_tensor(DT_INT32, TensorShape());
diff --git a/tensorflow/core/common_runtime/dma_helper.h b/tensorflow/core/common_runtime/dma_helper.h
index 1cc8b9e723a26ba5848291a75029871d559a6b5b..cdfce1f366be66785a63a169c2107c2aaede1396 100644
--- a/tensorflow/core/common_runtime/dma_helper.h
+++ b/tensorflow/core/common_runtime/dma_helper.h
@@ -28,6 +28,9 @@ class DMAHelper {
   static void* base(Tensor* t) { return t->base<void>(); }
   static TensorBuffer* buffer(Tensor* t) { return t->buf_; }
   static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; }
+  static void UnsafeSetShape(Tensor* t, const TensorShape& s) {
+    t->set_shape(s);
+  }
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 0a586344ccf2228a23059d68e7aa2d7a8f4eadba..208697361d2dfc4f3b8290ea511d15c9bd86857b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 63ed0b8be16ecb187113311db5283c8d4f3b1a5e..b0ca7e31096de836921a2c469fc7201738059d83 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -85,8 +85,8 @@ GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
 
 void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   num_bytes += (2 * MASK_BYTES);
-
   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+  if (allocated_ptr == nullptr) return allocated_ptr;
 
   // Return the pointer after the header
   void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;
@@ -102,11 +102,13 @@ void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return rv;
 }
 void GPUDebugAllocator::DeallocateRaw(void* ptr) {
-  CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
-  CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
+  if (ptr != nullptr) {
+    CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
+    CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
 
-  // Backtrack to the beginning of the header.
-  ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+    // Backtrack to the beginning of the header.
+    ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+  }
   // Deallocate the memory
   base_allocator_->DeallocateRaw(ptr);
 }
@@ -168,10 +170,12 @@ GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }
 
 void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+  if (allocated_ptr == nullptr) return allocated_ptr;
 
   // Initialize the buffer to Nans
   size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
-  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
+  std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
+                          std::nanf(""));
   gpu::DeviceMemory<float> nan_ptr{
       gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
 
@@ -182,13 +186,16 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return allocated_ptr;
 }
 void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
-  // Reset the buffer to Nans
-  size_t req_size = base_allocator_->RequestedSize(ptr);
-  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
-  gpu::DeviceMemory<float> nan_ptr{
-      gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
-  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-    LOG(ERROR) << "Could not initialize to NaNs";
+  if (ptr != nullptr) {
+    // Reset the buffer to Nans
+    size_t req_size = base_allocator_->RequestedSize(ptr);
+    std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
+                            std::nanf(""));
+    gpu::DeviceMemory<float> nan_ptr{
+        gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
+    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
+      LOG(ERROR) << "Could not initialize to NaNs";
+    }
   }
 
   // Deallocate the memory
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 0db08dc9759c9306ebd99b4acf4967128ef04895..adce3a84368ced958002443721016778cb6df028 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index 38d669ea07c91bc1a892ecf925b3141f2ca506dd..91ce830df8521e7fe8284dd3c52d1bbf667891cd 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include <map>
 #include <memory>
 #include <vector>
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 2f17af273ff8cdc83a112ef350fde88346c7e13d..6a3e6906a3e3f076fd3b4731e36f86a1960742a6 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
@@ -237,6 +239,50 @@ void GraphExecutionState::RestoreStatefulNodes(Graph* graph) {
   }
 }
 
+namespace {
+
+class TensorConnectionPruneRewrite : public subgraph::PruneRewrite {
+ public:
+  TensorConnectionPruneRewrite(const string* endpoint_name,
+                               NodeBuilder::NodeOut from_tensor)
+      : subgraph::PruneRewrite(endpoint_name, nullptr /* device_info */),
+        from_tensor_(std::move(from_tensor)) {}
+
+  Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                 Node** out_node) override {
+    Status s;
+    auto check_no_cycle_fn = [this, feed_tensor, &s](Node* n) {
+      if (n == feed_tensor.node) {
+        s.Update(errors::InvalidArgument(
+            "Requested Tensor connection between nodes \"",
+            feed_tensor.node->name(), "\" and \"", from_tensor_.node->name(),
+            "\" would create a cycle."));
+      }
+    };
+    ReverseDFSFrom(*g, {from_tensor_.node}, std::move(check_no_cycle_fn),
+                   nullptr);
+    TF_RETURN_IF_ERROR(s);
+
+    TF_RETURN_IF_ERROR(
+        NodeBuilder(strings::StrCat("_identity_", feed_tensor.node->name(), "_",
+                                    feed_tensor.index),
+                    "Identity")
+            .Input(from_tensor_)
+            .Attr("T",
+                  BaseType(from_tensor_.node->output_type(from_tensor_.index)))
+            .Finalize(g, out_node));
+
+    (*out_node)->set_assigned_device_name(
+        feed_tensor.node->assigned_device_name());
+    return Status::OK();
+  }
+
+ private:
+  NodeBuilder::NodeOut from_tensor_;
+};
+
+}  // namespace
+
 Status GraphExecutionState::PruneGraph(
     const BuildGraphOptions& options, Graph* graph,
     subgraph::RewriteGraphMetadata* out_rewrite_metadata) {
@@ -265,12 +311,48 @@ Status GraphExecutionState::PruneGraph(
           new subgraph::SendFetchRewrite(&fetch, device_info));
     }
   }
+
+  for (const TensorConnection& tensor_connection :
+       options.callable_options.tensor_connection()) {
+    Node* from_node = nullptr;
+    TensorId from_id(ParseTensorName(tensor_connection.from_tensor()));
+
+    for (Node* n : graph->nodes()) {
+      if (n->name() == from_id.first) {
+        from_node = n;
+        break;
+      }
+    }
+    if (from_node == nullptr) {
+      return errors::InvalidArgument(
+          "Requested tensor connection from unknown node: \"",
+          tensor_connection.to_tensor(), "\".");
+    }
+    if (from_id.second >= from_node->num_outputs()) {
+      return errors::InvalidArgument(
+          "Requested tensor connection from unknown edge: \"",
+          tensor_connection.to_tensor(),
+          "\" (actual number of outputs = ", from_node->num_outputs(), ").");
+    }
+
+    feed_rewrites.emplace_back(new TensorConnectionPruneRewrite(
+        &tensor_connection.to_tensor(), {from_node, from_id.second}));
+  }
+
   std::vector<string> target_node_names(
       options.callable_options.target().begin(),
       options.callable_options.target().end());
-  return subgraph::RewriteGraphForExecution(graph, feed_rewrites,
-                                            fetch_rewrites, target_node_names,
-                                            out_rewrite_metadata);
+  TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
+      graph, feed_rewrites, fetch_rewrites, target_node_names,
+      out_rewrite_metadata));
+
+  CHECK_EQ(out_rewrite_metadata->feed_types.size(),
+           options.callable_options.feed_size() +
+               options.callable_options.tensor_connection_size());
+  for (int i = 0; i < options.callable_options.tensor_connection_size(); ++i) {
+    out_rewrite_metadata->feed_types.pop_back();
+  }
+  return Status::OK();
 }
 
 Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
@@ -340,7 +422,13 @@ Status GraphExecutionState::OptimizeGraph(
                       options.callable_options.target().begin(),
                       options.callable_options.target().end());
 
-    if (!options.callable_options.feed().empty()) {
+    for (const TensorConnection& tensor_connection :
+         options.callable_options.tensor_connection()) {
+      item.fetch.push_back(tensor_connection.from_tensor());
+    }
+
+    if (!(options.callable_options.feed().empty() &&
+          options.callable_options.tensor_connection().empty())) {
       std::unordered_set<string> feeds;
       for (const string& feed : options.callable_options.feed()) {
         TensorId id = ParseTensorName(feed);
@@ -349,6 +437,15 @@ Status GraphExecutionState::OptimizeGraph(
         }
         feeds.insert(id.first.ToString());
       }
+      for (const TensorConnection& tensor_connection :
+           options.callable_options.tensor_connection()) {
+        TensorId id = ParseTensorName(tensor_connection.to_tensor());
+        if (id.second != 0) {
+          return errors::InvalidArgument("Unsupported feed: ",
+                                         tensor_connection.to_tensor());
+        }
+        feeds.insert(id.first.ToString());
+      }
       for (const NodeDef& node : original_graph_def_.node()) {
         if (feeds.find(node.name()) == feeds.end()) {
           continue;
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index 1125d2a34a5adcde5153ea4f039d0bda3159deb4..790f2eaa1e9de96b5cd399dd53a1e49696035f21 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// TODO(skyewm): this is necessary to make the single_threaded_cpu_device.h
+// include work. Some other include must be including eigen without defining
+// this. Consider defining in this in a BUILD rule.
+#define EIGEN_USE_THREADS
+
 #include "tensorflow/core/common_runtime/graph_runner.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
@@ -20,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/common_runtime/single_threaded_cpu_device.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_util.h"
@@ -36,18 +42,6 @@ namespace tensorflow {
 
 namespace {
 
-std::unique_ptr<Device> GetCPUDevice(Env* env) {
-  std::vector<Device*> devices;
-  SessionOptions session_options;
-  session_options.env = env;
-  Status s = DeviceFactory::GetFactory(DEVICE_CPU)
-                 ->CreateDevices(session_options, "", &devices);
-  if (s.ok() && !devices.empty()) {
-    return std::unique_ptr<Device>(devices[0]);
-  }
-  return nullptr;
-}
-
 // A simple rendezvous class.
 // Assumes a single sender and a single receiver, no duplicate sends, and no
 // sends of dead tensors.
@@ -98,7 +92,8 @@ class SimpleRendezvous : public Rendezvous {
 }  // namespace
 
 GraphRunner::GraphRunner(Env* env)
-    : device_deleter_(GetCPUDevice(env)), device_(device_deleter_.get()) {}
+    : device_deleter_(new SingleThreadedCpuDevice(env)),
+      device_(device_deleter_.get()) {}
 GraphRunner::GraphRunner(Device* device) : device_(device) {}
 
 GraphRunner::~GraphRunner() {}
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
index 829c19204af19119667fb455aad6505b388de94e..43a909466ed4b6fe6ea32b1ad72a1154390288ac 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr const char* MklCPUAllocator::kMaxLimitStr;
+constexpr const size_t MklCPUAllocator::kDefaultMaxLimit;
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 55c8411ad017dd8a2e64309bc426d96852a2a696..b2ef51d10b33caf3b6ad9bd494d574abbcceff55 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include <cstdlib>
 #include <string>
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 92fdcb404e7ecc20e3079f1b21c37492daa5b047..d05f146f21ab247e090b110d875644dfd1fa3c96 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -144,7 +144,10 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext(
   }
   Device* device = flr->device();
   string device_type = device->parsed_name().type;
-  if (device_type == "CPU") return Status::OK();
+  if (device_type == "CPU" || device_type == "TPU_SYSTEM") {
+    // "TPU_SYSTEM" indicates that `device` is a CPU.
+    return Status::OK();
+  }
   if (device_type == "GPU") {
     auto* dev_info = flr->device()->tensorflow_gpu_device_info();
     if (dev_info) {
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 60263d1471943fed4392a4dc0704992589ce3b69..93f24a3217ef08fc7368365c9a43a913810f211b 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -121,27 +121,36 @@ void IntraProcessRendezvous::RecvAsync(const ParsedKey& parsed,
   // Recv the tensor from local_.
   local_->RecvAsync(
       parsed, recv_args,
-      [this, parsed, done](
-          const Status& status, const Rendezvous::Args& send_args,
-          const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) {
-        // If "in" is an uninitialized tensor, do copy-construction to preserve
-        // the uninitialized state, along with data type and shape info, which
-        // is useful for debugger purposes.
-        Tensor* out = in.IsInitialized() ? new Tensor : new Tensor(in);
-
-        StatusCallback final_callback = [done, send_args, recv_args, out,
-                                         is_dead](const Status& s) {
-          done(s, send_args, recv_args, *out, is_dead);
-          delete out;
-        };
-
-        if (status.ok() && in.IsInitialized()) {
-          SameWorkerRecvDone(parsed, send_args, recv_args, in, out,
-                             std::move(final_callback));
-        } else {
-          final_callback(status);
-        }
-      });
+      std::bind(
+          [this, parsed](DoneCallback done,
+                         // Begin unbound arguments.
+                         const Status& status,
+                         const Rendezvous::Args& send_args,
+                         const Rendezvous::Args& recv_args, const Tensor& in,
+                         bool is_dead) {
+            // If "in" is an uninitialized tensor, do copy-construction to
+            // preserve the uninitialized state, along with data type and shape
+            // info, which is useful for debugger purposes.
+            Tensor* out = in.IsInitialized() ? new Tensor : new Tensor(in);
+
+            auto final_callback = std::bind(
+                [send_args, recv_args, out, is_dead](DoneCallback done,
+                                                     // Begin unbound arguments.
+                                                     const Status& s) {
+                  done(s, send_args, recv_args, *out, is_dead);
+                  delete out;
+                },
+                std::move(done), std::placeholders::_1);
+
+            if (status.ok() && in.IsInitialized()) {
+              SameWorkerRecvDone(parsed, send_args, recv_args, in, out,
+                                 std::move(final_callback));
+            } else {
+              final_callback(status);
+            }
+          },
+          std::move(done), std::placeholders::_1, std::placeholders::_2,
+          std::placeholders::_3, std::placeholders::_4, std::placeholders::_5));
 }
 
 void IntraProcessRendezvous::StartAbort(const Status& s) {
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79d03a24ced4ec5625e7713350849fc85b1e54f4
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -0,0 +1,542 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+
+namespace tensorflow {
+namespace {
+// Each CollectiveOp implementation is free to define its own
+// BufRendezvous key format.  This function produces the key used by
+// RingReducer.
+string RingReduceBufKey(const string& exec_key, int pass, int section,
+                        int source_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat("rred(", exec_key, "):pass(", pass, "):section(",
+                           section, "):srcrank(", source_rank, ")");
+  } else {
+    // TODO(tucker): Try out some kind of denser encoding, e.g. 128 bit hash.
+    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
+  }
+}
+
+}  // namespace
+
+void RingReducer::PCQueue::Enqueue(RingField* rf) {
+  mutex_lock l(pcq_mu_);
+  deque_.push_back(rf);
+  if (waiter_count_ > 0) {
+    cv_.notify_one();
+  }
+}
+
+RingReducer::RingField* RingReducer::PCQueue::Dequeue() {
+  mutex_lock l(pcq_mu_);
+  if (deque_.empty()) {
+    ++waiter_count_;
+    while (deque_.empty()) {
+      cv_.wait(l);
+    }
+    --waiter_count_;
+  }
+  RingField* rf = deque_.front();
+  deque_.pop_front();
+  return rf;
+}
+
+RingReducer::RingReducer(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+                         OpKernelContext* ctx,
+                         OpKernelContext::Params* op_params,
+                         const CollectiveParams& col_params,
+                         const string& exec_key, int64 step_id,
+                         const Tensor* input, Tensor* output)
+    : col_exec_(col_exec),
+      dev_mgr_(dev_mgr),
+      ctx_(ctx),
+      op_params_(op_params),
+      col_params_(col_params),
+      exec_key_(exec_key),
+      input_(input),
+      output_(output),
+      rank_(col_params.subdiv_rank[0]),
+      step_id_(step_id),
+      group_size_(col_params.group.group_size),
+      num_subdivs_(static_cast<int>(
+          col_params.instance.impl_details.subdiv_permutations.size())),
+      done_(nullptr),
+      device_(nullptr),
+      device_name_(
+          col_params_.instance.device_names[col_params_.default_rank]) {
+  CHECK_GT(group_size_, 0);
+  CHECK_GT(num_subdivs_, 0);
+}
+
+string RingReducer::TensorDebugString(Tensor tensor) {
+  const DeviceBase::GpuDeviceInfo* gpu_device_info =
+      ctx_->device()->tensorflow_gpu_device_info();
+  if (gpu_device_info) {
+    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
+    Notification note;
+    gpu_device_info->default_context->CopyDeviceTensorToCPU(
+        &tensor, "" /*tensor_name*/, device_, &cpu_tensor,
+        [&note](const Status& s) {
+          CHECK(s.ok());
+          note.Notify();
+        });
+    note.WaitForNotification();
+    return cpu_tensor.SummarizeValue(64);
+  } else {
+    return tensor.SummarizeValue(64);
+  }
+}
+
+void RingReducer::Run(StatusCallback done) {
+  done_ = std::move(done);
+
+  // Get local execution device.
+  if (VLOG_IS_ON(1)) {
+    string buf;
+    for (int r = 0; r < col_params_.instance.device_names.size(); ++r) {
+      strings::StrAppend(&buf, "dev ", r, " : ",
+                         col_params_.instance.device_names[r], "\n");
+    }
+    for (int sd = 0;
+         sd < col_params_.instance.impl_details.subdiv_permutations.size();
+         ++sd) {
+      strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
+      for (auto x : col_params_.instance.impl_details.subdiv_permutations[sd]) {
+        strings::StrAppend(&buf, x, ", ");
+      }
+    }
+    VLOG(1) << "RingReducer::Run for device " << device_name_
+            << " default_rank " << col_params_.default_rank << "\n"
+            << buf;
+  }
+  CHECK(dev_mgr_);
+  Status status = dev_mgr_->LookupDevice(
+      col_params_.instance.device_names[col_params_.default_rank], &device_);
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to find device "
+               << col_params_.instance.device_names[col_params_.default_rank];
+    for (auto d : dev_mgr_->ListDevices()) {
+      LOG(ERROR) << "Available device " << d->name();
+    }
+    done_(status);
+    return;
+  }
+  CHECK(device_);
+  device_locality_ = device_->attributes().locality();
+
+  VLOG(1) << this << " default_rank " << col_params_.default_rank << " cp "
+          << &col_params_ << ": " << col_params_.ToString();
+
+  // Start by copying input to output if they're not already the same, i.e. if
+  // we're not computing in-place on the input tensor.
+  if ((input_ != output_) &&
+      (DMAHelper::base(input_) != DMAHelper::base(output_))) {
+    CollectiveRemoteAccessLocal::MemCpyAsync(
+        ctx_->input_device_context(0), ctx_->op_device_context(), device_,
+        device_, ctx_->input_alloc_attr(0), ctx_->output_alloc_attr(0), input_,
+        output_, [this](const Status& s) {
+          if (!s.ok()) {
+            done_(s);
+          } else {
+            ContinueAfterInputCopy();
+          }
+        });
+  } else {
+    ContinueAfterInputCopy();
+  }
+}
+
+void RingReducer::ContinueAfterInputCopy() {
+  AllocatorAttributes attr = ctx_->output_alloc_attr(0);
+  ca_.reset(MakeCollectiveAdapter(output_, group_size_ * num_subdivs_,
+                                  device_->GetAllocator(attr)));
+
+  if (col_params_.final_op) {
+    // Create an on-device scalar value from group_size_ that may be needed
+    // later.
+    // TODO(tucker): Cache and reuse across invocations? Or maybe the scalar
+    // can be provided to the kernel in host memory?
+    Tensor group_size_val = ca_->Scalar(group_size_);
+    if (col_params_.group.device_type != "CPU") {
+      group_size_tensor_ =
+          ca_->Scalar(device_->GetAllocator(ctx_->input_alloc_attr(0)));
+      DeviceContext* op_dev_ctx = ctx_->op_device_context();
+      op_dev_ctx->CopyCPUTensorToDevice(&group_size_val, device_,
+                                        &group_size_tensor_,
+                                        [this](const Status& s) {
+                                          if (!s.ok()) {
+                                            StartAbort(s);
+                                          }
+                                          group_size_tensor_ready_.Notify();
+                                        });
+    } else {
+      group_size_tensor_ = group_size_val;
+      group_size_tensor_ready_.Notify();
+    }
+  }
+  Finish(RunAsyncParts());
+}
+
+void RingReducer::StartAbort(const Status& s) {
+  // In abort mode we stop issuing additional ProvideBuf
+  // and ConsumeBuf calls, but we need to wait for all of the
+  // outstanding callbacks to be invoked before quitting.
+  bool abort_started = false;
+  {
+    mutex_lock l(status_mu_);
+    if (status_.ok()) {
+      LOG(ERROR) << "Aborting RingReduce with " << s;
+      abort_started = true;
+      status_.Update(s);
+    }
+  }
+  // If this is the initial entry to abort mode then invoke StartAbort
+  // on the CollectiveExecutor that invoked us.  That should start
+  // cancellation on all of the outstanding CollectiveRemoteAccess
+  // actions.
+  if (abort_started) {
+    col_exec_->StartAbort(s);
+  }
+}
+
+void RingReducer::Finish(bool ok) {
+  if (ok) {
+    // Recover the output from the adaptor.
+    ca_->ConsumeFinalValue(output_);
+  }
+  Status s;
+  {
+    mutex_lock l(status_mu_);
+    s = status_;
+  }
+  done_(s);
+}
+
+RingReducer::SubContext::SubContext(OpKernelContext* ctx,
+                                    OpKernelContext::Params* params,
+                                    OpKernel* op, Tensor* output, Tensor* input)
+    : sub_params_(*params),
+      sub_inputs_({output, input}),
+      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
+      sub_input_dc_(
+          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
+  sub_params_.op_kernel = op;
+  sub_params_.inputs = &sub_inputs_;
+  sub_params_.input_alloc_attrs = &sub_input_attr_;
+  sub_params_.input_device_contexts = &sub_input_dc_;
+  sub_params_.eigen_gpu_device = nullptr;
+  sub_params_.ensure_eigen_gpu_device();
+  sub_ctx_ = new OpKernelContext(&sub_params_, 1);
+}
+
+Status RingReducer::ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
+                                 Tensor* input) {
+  // Prepare an OpKernelContext that is identical to that of the original Op
+  // (i.e. the collective), except for the input output sizes and identities and
+  // the Op itself.
+  // TODO(tucker): Is it possible to cache and reuse these objects?  They're
+  // mostly identical inside one device execution.
+  std::unique_ptr<SubContext> sub_ctx(
+      new SubContext(ctx_, op_params_, op, output, input));
+  device->Compute(op, sub_ctx->sub_ctx_);
+  return sub_ctx->sub_ctx_->status();
+}
+
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
+void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                                int field_idx) {
+  // Note on field indexing: There are group_size_ devices in the
+  // instance, implying the same number of chunks per tensor, where a
+  // chunk is the unit of data transferred in a time step.  However, if
+  // a device can simultaenously send data by 2 or more independent
+  // channels we can speed up the transfer by subdividing chunks and
+  // processing multiple subdivisions at once.  So the actual number
+  // of RingFields is group_size_ * num_subdivs_.
+  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
+  rf->chunk_idx = chunk_idx;
+  rf->subdiv_idx = subdiv_idx;
+  rf->sc_idx = field_idx;
+  rf->rank = col_params_.subdiv_rank[subdiv_idx];
+  rf->second_pass = false;
+  rf->action = RF_INIT;
+  // Recv from the device with preceding rank within the subdivision.
+  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  rf->recv_dev_idx = col_params_.instance.impl_details
+                         .subdiv_permutations[subdiv_idx][recv_from_rank];
+  int send_dev_idx = col_params_.instance.impl_details
+                         .subdiv_permutations[subdiv_idx][send_to_rank];
+  rf->recv_is_remote = !col_params_.task.is_local[rf->recv_dev_idx];
+  rf->send_is_remote = !col_params_.task.is_local[send_dev_idx];
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 0 we skip Recv when rank = chunk_idx
+    rf->do_recv = (rf->chunk_idx != rf->rank);
+    // In pass 0 we skip Send when rank = chunk_idx-1
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  if (rf->do_send || rf->do_recv) {
+    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
+    CHECK(rf->chunk.IsAligned()) << rf->DebugString();
+  }
+  if (rf->do_recv) {
+    rf->tmp_chunk = ca_->TempChunk(rf->sc_idx);
+    CHECK(rf->tmp_chunk.IsAligned()) << rf->DebugString();
+  }
+  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
+          << ca_->TBounds(rf->chunk);
+}
+
+// When a RingField transitions from first to second recompute the
+// do_send and do_recv values.
+void RingReducer::AdvanceToSecondPass(RingField* rf) {
+  VLOG(3) << "IncrRingField old value " << rf->DebugString();
+  CHECK(!rf->second_pass);
+  rf->second_pass = true;
+  rf->action = RF_INIT;
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 1 the send/no-send boundary moves down 1 place.
+    rf->do_recv =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  VLOG(3) << "IncrRingField new value " << rf->DebugString();
+}
+
+string RingReducer::RingField::DebugString() const {
+  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
+                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
+                              " action=", action);
+  strings::StrAppend(&rv, " pass=", second_pass);
+  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
+                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
+                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
+  return rv;
+}
+
+void RingReducer::DispatchSend(RingField* rf, const StatusCallback& done) {
+  CHECK(rf->do_send);
+  string send_buf_key =
+      RingReduceBufKey(exec_key_, rf->second_pass, rf->sc_idx, rf->rank);
+  VLOG(3) << "DispatchSend rank=" << col_params_.default_rank << " send key "
+          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
+          << rf->sc_idx;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  int send_to_dev_idx = col_params_.instance.impl_details
+                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
+  col_exec_->PostToPeer(col_params_.instance.device_names[send_to_dev_idx],
+                        col_params_.instance.task_names[send_to_dev_idx],
+                        send_buf_key, device_, ctx_->op_device_context(),
+                        ctx_->output_alloc_attr(0), &rf->chunk,
+                        device_locality_, done);
+}
+
+void RingReducer::DispatchRecv(RingField* rf, const StatusCallback& done) {
+  CHECK(rf->do_recv);
+  string recv_buf_key =
+      RingReduceBufKey(exec_key_, rf->second_pass, rf->sc_idx,
+                       (rf->rank + (group_size_ - 1)) % group_size_);
+  VLOG(3) << "DispatchRecv rank=" << col_params_.default_rank << " recv key "
+          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
+          << ((col_params_.merge_op != nullptr) ? "tmp_chunk" : "chunk");
+  Tensor* dst_tensor = (!rf->second_pass && (col_params_.merge_op != nullptr))
+                           ? &rf->tmp_chunk
+                           : &rf->chunk;
+  col_exec_->RecvFromPeer(col_params_.instance.device_names[rf->recv_dev_idx],
+                          col_params_.instance.task_names[rf->recv_dev_idx],
+                          col_params_.task.is_local[rf->recv_dev_idx],
+                          recv_buf_key, device_, ctx_->op_device_context(),
+                          ctx_->output_alloc_attr(0), dst_tensor,
+                          device_locality_, done);
+}
+
+string RingReducer::FieldState() {
+  string s = strings::StrCat("RingReducer ",
+                             strings::Hex(reinterpret_cast<uint64>(this)),
+                             " exec ", exec_key_, " step_id=", step_id_,
+                             " state of all ", rfv_.size(), " fields:");
+  for (int i = 0; i < rfv_.size(); ++i) {
+    s.append("\n");
+    s.append(rfv_[i].DebugString());
+  }
+  return s;
+}
+
+bool RingReducer::RunAsyncParts() {
+  // This function orchestrates RingReduce actions on behalf of a
+  // single device. It is entered by a blockable thread that
+  // loops within it until all actions assigned to that device
+  // complete. Hence function local variables are accessible only by that
+  // one thread and do not require an explicit mutex.
+  rfv_.clear();
+  rfv_.resize(group_size_ * num_subdivs_);
+  PCQueue ready_queue;
+  int field_done_count = 0;
+  int send_pending_count = 0;
+  int recv_pending_count = 0;
+  std::atomic<bool> aborted(false);
+  field_done_count = 0;
+  send_pending_count = 0;
+  recv_pending_count = 0;
+  for (int chunk_idx = 0; chunk_idx < group_size_; ++chunk_idx) {
+    for (int subdiv_idx = 0; subdiv_idx < num_subdivs_; ++subdiv_idx) {
+      int rf_index = (chunk_idx * num_subdivs_) + subdiv_idx;
+      InitRingField(&rfv_[rf_index], chunk_idx, subdiv_idx, rf_index);
+      ready_queue.Enqueue(&rfv_[rf_index]);
+    }
+  }
+
+  // Loop until all RingFields have advanced to completion.
+  while (field_done_count < rfv_.size()) {
+    VLOG(4) << FieldState();
+    // Wait for a RingField to appear in the ready_queue.
+    RingField* rf = ready_queue.Dequeue();
+    // Advance the RingField to its next action and execute, repeating
+    // until either an async action has been started or the RingField
+    // is done.
+    bool dispatched = false;  // true if async action was initiated
+    do {
+      if (aborted) break;
+      switch (rf->action) {
+        case RF_INIT:
+          if (rf->do_recv) {
+            rf->action = RF_RECV;
+            auto requeue = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchRecv(rf, requeue);
+            dispatched = true;
+            ++recv_pending_count;
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_RECV:
+          CHECK_GT(recv_pending_count, 0);
+          --recv_pending_count;
+          if (!rf->second_pass) {
+            rf->action = RF_REDUCE;
+            Status s = ComputeBinOp(device_, col_params_.merge_op.get(),
+                                    &rf->chunk, &rf->tmp_chunk);
+            if (!s.ok()) {
+              aborted = true;
+              StartAbort(s);
+            }
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_REDUCE:
+          if (!rf->second_pass && col_params_.final_op.get() && rf->is_final) {
+            rf->action = RF_FINALIZE;
+            group_size_tensor_ready_.WaitForNotification();
+            Status s = ComputeBinOp(device_, col_params_.final_op.get(),
+                                    &rf->chunk, &group_size_tensor_);
+            if (!s.ok()) {
+              aborted = true;
+              StartAbort(s);
+            }
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_FINALIZE:
+          rf->action = RF_DONE;
+          break;
+        case RF_SEND_READY:
+          if (rf->do_send) {
+            rf->action = RF_SEND;
+            auto send_complete = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchSend(rf, send_complete);
+            dispatched = true;
+            ++send_pending_count;
+          } else {
+            rf->action = RF_DONE;
+          }
+          break;
+        case RF_SEND:
+          CHECK_GT(send_pending_count, 0);
+          --send_pending_count;
+          rf->action = RF_DONE;
+          break;
+        case RF_DONE:
+          break;
+      }
+      if (rf->action == RF_DONE) {
+        if (rf->second_pass) {
+          ++field_done_count;
+          break;  // from do while(!dispatched)
+        } else {
+          AdvanceToSecondPass(rf);
+        }
+      }
+    } while (!dispatched);
+    if (aborted) break;
+  }  // while (field_done_count < number of fields)
+
+  if (aborted) {
+    // All of the pending data actions should be aborted; field the
+    // callbacks and clear the queue before quitting.
+    while ((send_pending_count > 0) || (recv_pending_count > 0)) {
+      RingField* rf = ready_queue.Dequeue();
+      switch (rf->action) {
+        case RF_RECV:
+          --recv_pending_count;
+          break;
+        case RF_SEND:
+          --send_pending_count;
+          break;
+        default: {}  // Ignore any other actions
+      }
+    }
+  }
+
+  CHECK_EQ(send_pending_count, 0);
+  CHECK_EQ(recv_pending_count, 0);
+
+  VLOG(2) << this << " rank=" << rank_ << " finish;"
+          << " final value " << TensorDebugString(ca_->Value());
+  return !aborted;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_reducer.h b/tensorflow/core/common_runtime/ring_reducer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fde18dc1c083fe1ed0b52ae9f211ed18ac123ef
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer.h
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
+
+#include <deque>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class DeviceMgr;
+
+// Ring-algorithm implementation of collective all-reduce.
+class RingReducer {
+ public:
+  RingReducer(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+              OpKernelContext* ctx, OpKernelContext::Params* op_params,
+              const CollectiveParams& col_params, const string& exec_key,
+              int64 step_id, const Tensor* input, Tensor* output);
+
+  virtual ~RingReducer() {}
+
+  void Run(StatusCallback done);
+
+ private:
+  // Called when a bad status is received that implies we should terminate
+  // execution and return a bad status.
+  void StartAbort(const Status& s);
+  void ContinueAfterInputCopy();
+  void Finish(bool ok);
+  Status ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
+                      Tensor* input);
+  bool RunAsyncParts();
+
+  // Used for executing a sub-operation, e.g. a merge_op instance, with
+  // an OpKernelContext based on the one passed into this Op.
+  class SubContext {
+   public:
+    OpKernelContext::Params sub_params_;
+    gtl::InlinedVector<TensorValue, 4> sub_inputs_;
+    gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
+    gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
+    // Used only for Binary and Unary Ops for which we require
+    // the calculation to be in-place on the first input.
+    int forward_from_ = 0;
+    OpKernelContext* sub_ctx_;
+    SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+               OpKernel* op, Tensor* output, Tensor* input);
+    ~SubContext() { delete sub_ctx_; }
+  };
+
+  // Current status of a RingField
+  enum RingFieldAction {
+    RF_INIT = 0,    // Just initialized for a pass
+    RF_RECV,        // Recv pending
+    RF_REDUCE,      // Reduce pending
+    RF_FINALIZE,    // FinalOp pending
+    RF_SEND_READY,  // Ready to send
+    RF_SEND,        // Send pending
+    RF_DONE,        // No more work
+  };
+
+  // Tracks progress of actions on a single subfield of the entire tensor.
+  struct RingField {
+    int16 chunk_idx;     // major division index
+    int16 subdiv_idx;    // minor division index
+    int16 sc_idx;        // subchunk index
+    int16 rank;          // rank within subdiv permutation
+    int16 recv_dev_idx;  // dev from which value should be recv'd
+    RingFieldAction action;
+    bool second_pass;
+    bool recv_is_remote = false;
+    bool send_is_remote = false;
+    bool do_send = false;   // is the value sent in this pass?
+    bool do_recv = false;   // is the value recv'd in this pass?
+    bool is_final = false;  // is the last field in the pass for this rank
+    Tensor chunk;           // alias to field values
+    Tensor tmp_chunk;
+    Status status;
+    string DebugString() const;
+  };
+  void AdvanceToSecondPass(RingField* rf);
+  void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                     int field_idx);
+  void DispatchSend(RingField* rf, const StatusCallback& done);
+  void DispatchRecv(RingField* rf, const StatusCallback& done);
+
+  // For constructing log messages for debugging.
+  string FieldState();
+  string TensorDebugString(Tensor tensor);
+
+  // Producer/Consumer Queue of RingField structs.
+  class PCQueue {
+   public:
+    void Enqueue(RingField* rf);
+    RingField* Dequeue();
+
+   private:
+    mutex pcq_mu_;
+    condition_variable cv_;
+    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
+    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
+  };
+
+  CollectiveExecutor* col_exec_;        // Not owned
+  const DeviceMgr* dev_mgr_;            // Not owned
+  OpKernelContext* ctx_;                // Not owned
+  OpKernelContext::Params* op_params_;  // Not owned
+  const CollectiveParams& col_params_;
+  const string exec_key_;
+  const Tensor* input_;  // Not owned
+  Tensor* output_;       // Not owned
+  const int rank_;
+  const int64 step_id_;
+  const int group_size_;
+  const int num_subdivs_;
+  Tensor group_size_tensor_;
+  Notification group_size_tensor_ready_;
+  std::unique_ptr<CollectiveAdapter> ca_;
+  StatusCallback done_;
+  Device* device_;  // The device for which this instance labors
+  const string device_name_;
+  DeviceLocality device_locality_;
+
+  mutex status_mu_;
+  Status status_ GUARDED_BY(status_mu_);
+
+  std::vector<RingField> rfv_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_REDUCER_H_
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4387a074af79f97e17d1f9f1d828157b738fa40
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -0,0 +1,606 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_reducer.h"
+
+#include <algorithm>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      done(errors::Internal("Deliberate failure"));
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                    const DeviceType& device_type,
+                                    DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) {
+    LOG(FATAL) << status;
+  }
+  return k;
+}
+
+std::unique_ptr<OpKernel> GetAdd(DataType dtype, const DeviceType& device_type,
+                                 DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Add");
+  TF_CHECK_OK(builder.Attr("T", dtype)
+                  .Input(FakeInput(dtype))
+                  .Input(FakeInput(dtype))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device_type, device);
+}
+
+std::unique_ptr<OpKernel> GetDiv(DataType dtype, const DeviceType& device_type,
+                                 DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Div");
+  TF_CHECK_OK(builder.Attr("T", dtype)
+                  .Input(FakeInput(dtype))
+                  .Input(FakeInput(dtype))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device_type, device);
+}
+
+static int64 kStepId = 123;
+
+class RingReducerTest : public ::testing::Test {
+ protected:
+  RingReducerTest() : device_type_(DEVICE_CPU) {}
+
+  void SetUp() override {
+#if GOOGLE_CUDA
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+#endif
+  }
+
+  ~RingReducerTest() override {
+    stop_ = true;
+    for (auto i : instances_) {
+      delete i;
+    }
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int num_subdivs, int fail_after) {
+    device_type_ = device_type;
+    std::vector<Device*> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name =
+              strings::StrCat("/job:worker/replica:0/task:", wi, "/cpu:", di);
+          local_devices.push_back(new ThreadPoolDevice(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(gpu_devices_[dev_idx]);
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      LOG(ERROR) << "resetting dev_mgr for " << local_devices.size()
+                 << " devices: ";
+      dev_mgr_.reset(new DeviceMgr(local_devices));
+    }
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
+                                           dev_mgr_.get());
+    col_params_.name = "test_collective";
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    col_params_.instance.data_type = dtype;
+    col_params_.instance.impl_details.subdiv_permutations.resize(num_subdivs);
+    col_params_.subdiv_rank.resize(num_subdivs);
+    int subdiv_stride = num_devices / num_subdivs;
+    for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/cpu:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name =
+              strings::StrCat(task_name, "/gpu:", di % gpu_devices_.size());
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  void Reduce() {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoReduce();
+        ++done;
+      });
+    }
+    while (done < static_cast<int>(instances_.size())) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int num_subdivs, int tensor_len,
+               int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, num_subdivs, fail_after);
+    std::vector<T> expected(tensor_len, 0.0);
+    for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+      DeviceInstance* instance = instances_[di];
+      instance->InitTensor(
+          dtype, TensorShape({tensor_len}), [&expected, dtype, di](Tensor* t) {
+            for (size_t i = 0; i < t->NumElements(); ++i) {
+              // The cast is necessary to prevent clang-tidy from insisting
+              // that a faster non-open source function be substituted.
+              float value = pow(10, static_cast<double>(di)) * i;
+              if (dtype == DT_INT32 || dtype == DT_INT64) {
+                value = di * 10 + i;
+              }
+              t->flat<T>()(i) = static_cast<T>(value);
+              expected[i] += value;
+            }
+          });
+    }
+    Reduce();
+    if (fail_after > 0) {
+      // Confirm that every device terminated with the expected error status.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        EXPECT_EQ("Deliberate failure",
+                  instances_[di]->status_.error_message());
+      }
+    } else {
+      // Confirm that every device computed the same correct reduction value.
+      for (int i = 0; i < tensor_len; ++i) {
+        expected[i] /= (num_workers * num_devices);
+      }
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        TF_EXPECT_OK(instances_[di]->status_);
+        Tensor* inst = &instances_[di]->tensor_;
+        CHECK(inst);
+        Tensor actual(dtype, TensorShape({tensor_len}));
+        if (device_type_ == DEVICE_CPU) {
+          CHECK(actual.CopyFrom(*inst, inst->shape()));
+          VLOG(1) << "actual " << actual.SummarizeValue(100);
+        } else if (device_type_ == DEVICE_GPU) {
+          Notification note;
+          Device* dev = instances_[di]->device_;
+          auto* dev_info = dev->tensorflow_gpu_device_info();
+          CHECK(dev_info);
+          dev_info->default_context->CopyDeviceTensorToCPU(
+              inst, "" /*tensor_name*/, dev, &actual, [&note](const Status& s) {
+                CHECK(s.ok());
+                note.Notify();
+              });
+          note.WaitForNotification();
+        }
+
+        for (int i = 0; i < tensor_len; ++i) {
+          switch (dtype) {
+            case DT_FLOAT:
+              EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_DOUBLE:
+              EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_INT32:
+            case DT_INT64:
+              EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            default:
+              LOG(FATAL) << "unimplemented";
+          }
+        }
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveReduce(const CollectiveParams& params,
+                                                Tensor* input,
+                                                const DeviceType& device_type,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_reduce_", reduce_counter_++),
+        "CollectiveReduce");
+    TF_CHECK_OK(
+        builder.Attr("T", params.instance.data_type)
+            .Attr("merge_op", "Add")
+            .Attr("final_op", "Id")
+            .Attr("group_size", params.group.group_size)
+            .Attr("group_key", params.group.group_key)
+            .Attr("instance_key", params.instance.instance_key)
+            .Attr("subdiv_offsets", params.instance.impl_details.subdiv_offsets)
+            .Input(FakeInput(params.instance.data_type))
+            .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, RingReducerTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_))
+          << "Couldn't find device " << dev_name
+          << " existing devices: " << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int num_subdivs = static_cast<int>(col_params_.subdiv_rank.size());
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size,
+               static_cast<int>(col_params_.instance.device_names.size()));
+      // Id of this device is at rank position in first subdiv perm.
+      int my_device_id =
+          col_params_.instance.impl_details.subdiv_permutations[0][rank];
+      col_params_.default_rank = my_device_id;
+      // Set rank for all other subdivs by finding that device_id.
+      for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+        for (int r = 0; r < static_cast<int>(col_params_.instance.impl_details
+                                                 .subdiv_permutations[sdi]
+                                                 .size());
+             ++r) {
+          if (my_device_id ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            break;
+          }
+        }
+      }
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        init_f(&tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        init_f(&cpu_tensor);
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        Notification note;
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &tensor_, [&note](const Status& s) {
+              CHECK(s.ok());
+              note.Notify();
+            });
+        note.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoReduce() {
+      col_params_.merge_op =
+          GetAdd(col_params_.instance.data_type, device_type_, device_);
+      col_params_.final_op =
+          GetDiv(col_params_.instance.data_type, device_type_, device_);
+
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from = 0;
+      op_params.forward_from_array = &forward_from;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op = parent_->GetCollectiveReduce(
+          col_params_, &tensor_, DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the
+      // output allocation that it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, tensor_.shape(),
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a RingReducer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      RingReducer rr(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx,
+                     &op_params, col_params_, exec_key, kStepId, &tensor_,
+                     &tensor_);
+
+      // Start execution in a threadpool then wait for completion.
+      Notification notification;
+      SchedClosure([this, &notification, &rr]() {
+        rr.Run([this, &notification](Status s) {
+          status_ = s;
+          notification.Notify();
+        });
+      });
+      notification.WaitForNotification();
+      CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+
+      dev_ctx->Unref();
+    }
+
+    const Tensor& tensor() { return tensor_; }
+
+    RingReducerTest* parent_;
+    string dev_name_;
+    DeviceType device_type_;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };
+
+  bool stop_ = false;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<tensorflow::Device*> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  mutex mu_;
+  int32 reduce_counter_ GUARDED_BY(mu_) = 0;
+};
+
+#define DEF_TEST(B, T, W, D, S, L, A)                                         \
+  TEST_F(RingReducerTest,                                                     \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Sdiv##S##_Len##L##_Abrt##A) { \
+    DataType dtype = DT_##B;                                                  \
+    switch (dtype) {                                                          \
+      case DT_FLOAT: {                                                        \
+        RunTest<float>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_DOUBLE: {                                                       \
+        RunTest<double>(dtype, DEVICE_##T, W, D, S, L, A);                    \
+      } break;                                                                \
+      case DT_INT32: {                                                        \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_INT64: {                                                        \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      default:                                                                \
+        LOG(FATAL) << "Unimplemented";                                        \
+    }                                                                         \
+  }
+
+#ifndef GOOGLE_CUDA
+// Success tests
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 3, 1045991, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 4, 1045991, 0)
+DEF_TEST(DOUBLE, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(DOUBLE, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(INT32, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT32, CPU, 2, 8, 3, 4095, 0)
+DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, CPU, 2, 8, 3, 4095, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11)
+#endif
+
+#ifdef GOOGLE_CUDA
+// GPU tests.  So long as the device names are all in a single tasks we
+// bypass inter-worker routing code and can fake multiple GPUs with a single
+// GPU, from the perspective of the RingReducer logic.  So these tests
+// are all single-worker.
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 3, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 3, 1045991, 0)
+DEF_TEST(FLOAT, GPU, 1, 4, 4, 1045991, 0)
+DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
+// INT32 values are never on the GPU.
+// DEF_TEST(INT32, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5)
+#endif
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 1b7e3138ee5073f48829ff55cba0108bd69785fc..06dbe049868b2f85e8ebcabe4df5cec2170486b4 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -431,6 +431,32 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
   InferenceContext* src_context = GetContext(input_edge->src());
   if (src_context == nullptr) return errors::Internal("Missing src context");
   ShapeHandle src_shape = src_context->output(input_edge->src_output());
+
+  if (src_context->Value(src_context->Rank(src_shape)) == 0) {
+    Tensor t;
+    bool evaluated = false;
+    TF_RETURN_IF_ERROR(
+        EvaluateConstantTensorForEdge(node, dst_idx, &evaluated, &t));
+    if (!evaluated) {
+      return errors::InvalidArgument(
+          "Received a shape scalar with unknown static value.  A static value "
+          "of '-1' is required to represent an unknown shape.");
+    }
+    if (t.dims() == 0) {
+      if (t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) {
+        *result = target_context->UnknownShape();
+        return Status::OK();
+      } else if (t.dtype() == DT_INT64 && t.scalar<int64>()() == -1) {
+        *result = target_context->UnknownShape();
+        return Status::OK();
+      }
+    }
+    return errors::InvalidArgument(
+        "Received an invalid shape scalar with a static value that is not "
+        "'-1': ",
+        t.DebugString());
+  }
+
   TF_RETURN_IF_ERROR(src_context->WithRank(src_shape, 1, &src_shape));
 
   const string& src_op = input_edge->src()->type_string();
diff --git a/tensorflow/core/common_runtime/single_threaded_cpu_device.h b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..04d5af9087059232097e7aebeb32141a3046ee63
--- /dev/null
+++ b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
@@ -0,0 +1,82 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+
+namespace tensorflow {
+
+class Env;
+
+// A simple single-threaded CPU device. This can be used to run inexpensive
+// computations. In particular, using this avoids initializing the global thread
+// pools in LocalDevice.
+class SingleThreadedCpuDevice : public Device {
+ public:
+  SingleThreadedCpuDevice(Env* env)
+      : Device(env, Device::BuildDeviceAttributes("/device:CPU:0", DEVICE_CPU,
+                                                  Bytes(256 << 20),
+                                                  DeviceLocality())) {
+    eigen_worker_threads_.num_threads = 1;
+    eigen_worker_threads_.workers = new thread::ThreadPool(
+        env, "graph_runner", eigen_worker_threads_.num_threads);
+    eigen_threadpool_wrapper_.reset(
+        new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
+    eigen_device_.reset(new Eigen::ThreadPoolDevice(
+        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
+    set_tensorflow_cpu_worker_threads(&eigen_worker_threads_);
+    set_eigen_cpu_device(eigen_device_.get());
+  }
+
+  ~SingleThreadedCpuDevice() override {
+    eigen_threadpool_wrapper_.reset();
+    eigen_device_.reset();
+    delete eigen_worker_threads_.workers;
+  }
+
+  Status Sync() override { return Status::OK(); }
+
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                             const AllocatorAttributes alloc_attrs,
+                             Tensor* tensor) override {
+    Tensor parsed(tensor_proto.dtype());
+    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
+      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
+    }
+    *tensor = parsed;
+    return Status::OK();
+  }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override {
+    return cpu_allocator();
+  }
+
+ private:
+  DeviceBase::CpuWorkerThreads eigen_worker_threads_;
+  std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/test_collective_executor_mgr.h b/tensorflow/core/common_runtime/test_collective_executor_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0d4f24b111ed340f754ff4ab77223e8b19d68ab
--- /dev/null
+++ b/tensorflow/core/common_runtime/test_collective_executor_mgr.h
@@ -0,0 +1,116 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+
+// Mock objects that can't actually execute a Collective, but satisfy
+// general infrastructure expectations within tests that don't require
+// full functionality.
+
+class TestCollectiveExecutor : public CollectiveExecutor {
+ public:
+  explicit TestCollectiveExecutor(CollectiveExecutorMgrInterface* cem)
+      : CollectiveExecutor(cem) {}
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,  //???
+                    const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+};
+
+class TestCollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
+ public:
+  TestCollectiveExecutorMgr() {}
+
+  ~TestCollectiveExecutorMgr() override {
+    for (auto& iter : table_) {
+      iter.second->Unref();
+    }
+  }
+
+  CollectiveExecutor* FindOrCreate(int64 step_id) override {
+    mutex_lock l(mu_);
+    CollectiveExecutor* ce = nullptr;
+    auto iter = table_.find(step_id);
+    if (iter != table_.end()) {
+      ce = iter->second;
+    } else {
+      ce = new TestCollectiveExecutor(this);
+      table_[step_id] = ce;
+    }
+    ce->Ref();
+    return ce;
+  }
+
+  void Cleanup(int64 step_id) override {
+    mutex_lock l(mu_);
+    auto iter = table_.find(step_id);
+    if (iter != table_.end()) {
+      iter->second->Unref();
+      table_.erase(iter);
+    }
+  }
+
+  ParamResolverInterface* GetParamResolver() const override {
+    LOG(FATAL);
+    return nullptr;
+  }
+
+  DeviceResolverInterface* GetDeviceResolver() const override {
+    LOG(FATAL);
+    return nullptr;
+  }
+
+  void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                            GetStepSequenceResponse* response,
+                            const StatusCallback& done) override {
+    done(errors::Internal("unimplemented"));
+  }
+
+  void RefreshStepIdSequenceAsync(int64 graph_key,
+                                  const StatusCallback& done) override {
+    done(errors::Internal("unimplemented"));
+  }
+
+  int64 NextStepId(int64 graph_key) override {
+    return CollectiveExecutor::kInvalidId;
+  }
+
+  void RetireStepId(int64 graph_key, int64 step_id) override {}
+
+  mutex mu_;
+  gtl::FlatMap<int64, CollectiveExecutor*> table_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_TEST_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/core/framework/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
similarity index 94%
rename from tensorflow/core/framework/visitable_allocator.h
rename to tensorflow/core/common_runtime/visitable_allocator.h
index ed41b05531acaa1be803ac533854efe6160691b4..8edf922d11ee1662b78771bfdc7c38e0144aee19 100644
--- a/tensorflow/core/framework/visitable_allocator.h
+++ b/tensorflow/core/common_runtime/visitable_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
-#define TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
+#ifndef TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
 
 #include <functional>
 #include "tensorflow/core/framework/allocator.h"
@@ -76,4 +76,4 @@ class TrackingVisitableAllocator : public TrackingAllocator,
   VisitableAllocator* allocator_;
 };
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
+#endif  // TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc
index aaa4cfa7341c42bf9f7302e8ef30a28b68e6213c..76315462a738b7e70ce8c1f9ca5776d0037e22f9 100644
--- a/tensorflow/core/distributed_runtime/local_master.cc
+++ b/tensorflow/core/distributed_runtime/local_master.cc
@@ -157,6 +157,47 @@ Status LocalMaster::Reset(CallOptions* call_options,
   return ret;
 }
 
+Status LocalMaster::MakeCallable(CallOptions* call_options,
+                                 const MakeCallableRequest* request,
+                                 MakeCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->MakeCallable(request, response, [&n, &ret](const Status& s) {
+    ret.Update(s);
+    n.Notify();
+  });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+Status LocalMaster::RunCallable(CallOptions* call_options,
+                                const RunCallableRequest* request,
+                                RunCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->RunCallable(call_options, request, response,
+                            [&n, &ret](const Status& s) {
+                              ret.Update(s);
+                              n.Notify();
+                            });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+Status LocalMaster::ReleaseCallable(CallOptions* call_options,
+                                    const ReleaseCallableRequest* request,
+                                    ReleaseCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->ReleaseCallable(request, response, [&n, &ret](const Status& s) {
+    ret.Update(s);
+    n.Notify();
+  });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+
 namespace {
 mutex* get_local_master_registry_lock() {
   static mutex local_master_registry_lock(LINKER_INITIALIZED);
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index c20b40329ab1712b3dd0cae673d337481ee40196..cad6babad82b9b2ac2953f5497e46bb471699b10 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -71,6 +71,16 @@ class LocalMaster : public MasterInterface {
   Status Reset(CallOptions* call_options, const ResetRequest* request,
                ResetResponse* response) override;
 
+  Status MakeCallable(CallOptions* call_options,
+                      const MakeCallableRequest* request,
+                      MakeCallableResponse* response) override;
+  Status RunCallable(CallOptions* call_options,
+                     const RunCallableRequest* request,
+                     RunCallableResponse* response) override;
+  Status ReleaseCallable(CallOptions* call_options,
+                         const ReleaseCallableRequest* request,
+                         ReleaseCallableResponse* response);
+
   // Registers the mapping from the given `target` to the given `master`.
   //
   // WARNING: The `master` pointer remains owned by the caller. It is
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 1a488303ac73b8628b9d3fe4050ad9144724348e..f47502e844f70ed4005e2cd95220fc04341b8bc2 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -611,4 +611,55 @@ void Master::Reset(const ResetRequest* req, ResetResponse* resp,
   });
 }
 
+void Master::MakeCallable(const MakeCallableRequest* req,
+                          MakeCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, req, resp](MyClosure done) {
+        Status s = session->MakeCallable(*req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
+void Master::RunCallable(CallOptions* opts, const RunCallableRequest* req,
+                         RunCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, opts, req, resp](MyClosure done) {
+        Status s = session->RunCallable(opts, *req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
+void Master::ReleaseCallable(const ReleaseCallableRequest* req,
+                             ReleaseCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, req, resp](MyClosure done) {
+        Status s = session->ReleaseCallable(*req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index 678fc46bd7f4644022c5811f8a1c7a9f606be111..dbb337fd484960fbd3bfe47d0bfe0497985de66f 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -61,6 +61,13 @@ class Master {
   // See tensorflow::Reset() and the comment on ResetRequest.
   void Reset(const ResetRequest* req, ResetResponse* resp, MyClosure done);
 
+  void MakeCallable(const MakeCallableRequest* req, MakeCallableResponse* resp,
+                    MyClosure done);
+  void RunCallable(CallOptions* opts, const RunCallableRequest* req,
+                   RunCallableResponse* resp, MyClosure done);
+  void ReleaseCallable(const ReleaseCallableRequest* req,
+                       ReleaseCallableResponse* resp, MyClosure done);
+
  private:
   typedef Master ME;
 
diff --git a/tensorflow/core/distributed_runtime/master_interface.h b/tensorflow/core/distributed_runtime/master_interface.h
index bf6a2db3e27b301c01ca7d5073d175b24417220f..a8ae3cba3cdd3f02aae823d893e027b2bccae2c9 100644
--- a/tensorflow/core/distributed_runtime/master_interface.h
+++ b/tensorflow/core/distributed_runtime/master_interface.h
@@ -89,6 +89,16 @@ class MasterInterface {
   virtual Status Reset(CallOptions* call_options, const ResetRequest* request,
                        ResetResponse* response) = 0;
 
+  virtual Status MakeCallable(CallOptions* call_options,
+                              const MakeCallableRequest* request,
+                              MakeCallableResponse* response) = 0;
+  virtual Status RunCallable(CallOptions* call_options,
+                             const RunCallableRequest* request,
+                             RunCallableResponse* response) = 0;
+  virtual Status ReleaseCallable(CallOptions* call_options,
+                                 const ReleaseCallableRequest* request,
+                                 ReleaseCallableResponse* response) = 0;
+
  protected:
   // NOTE: This should only be called by implementations of this
   // interface whose CreateRunStepResponse() method returns a
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 64adf35c5e2bcac21f008a46848b886a954bec17..e0a5bb4c537095d2d56ee0844d67443e73e9756f 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -72,7 +72,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
-        debug_opts_(bopts.callable_options.run_options().debug_options()),
+        callable_opts_(bopts.callable_options),
         worker_cache_(worker_cache),
         should_deregister_(should_deregister) {
     VLOG(1) << "Created ReffedClientGraph for node with "
@@ -94,12 +94,18 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   const ClientGraph* client_graph() { return client_graph_.get(); }
 
+  const CallableOptions& callable_options() { return callable_opts_; }
+
   std::unique_ptr<ProfileHandler> GetProfileHandler(uint64 step,
                                                     int64 execution_count,
                                                     const RunOptions& ropts) {
     return stats_publisher_->GetProfileHandler(step, execution_count, ropts);
   }
 
+  int64 get_and_increment_execution_count() {
+    return execution_count_.fetch_add(1);
+  }
+
   // Turn RPC logging on or off, both at the WorkerCache used by this
   // master process, and at each remote worker in use for the current
   // partitions.
@@ -178,6 +184,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                        CallOptions* opts, const RunStepRequestWrapper& req,
                        MutableRunStepResponseWrapper* resp,
                        CancellationManager* cm, const bool is_last_partial_run);
+  Status RunPartitions(const MasterEnv* env, int64 step_id,
+                       int64 execution_count, PerStepState* pss,
+                       CallOptions* call_opts, const RunCallableRequest& req,
+                       RunCallableResponse* resp, CancellationManager* cm);
 
   // Calls workers to cleanup states for the step "step_id".  Calls
   // `done` when all cleanup RPCs have completed.
@@ -211,10 +221,11 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   const std::unique_ptr<ClientGraph> client_graph_;
   const SessionOptions session_opts_;
   const bool is_partial_;
-  const DebugOptions& debug_opts_;
+  const CallableOptions callable_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node_;
   const bool should_deregister_;
+  std::atomic<int64> execution_count_ = {0};
 
   // Graph partitioned into per-location subgraphs.
   struct Part {
@@ -269,6 +280,17 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
       const PartitionOptions& popts,
       std::unordered_map<string, GraphDef> graph_partitions);
 
+  // Prepares a number of calls to workers. One call per partition.
+  // This is a generic method that handles Run, PartialRun, and RunCallable.
+  template <class FetchListType, class ClientRequestType,
+            class ClientResponseType>
+  Status RunPartitionsHelper(
+      const std::unordered_map<StringPiece, size_t, StringPieceHasher>& feeds,
+      const FetchListType& fetches, const MasterEnv* env, int64 step_id,
+      int64 execution_count, PerStepState* pss, CallOptions* call_opts,
+      const ClientRequestType& req, ClientResponseType* resp,
+      CancellationManager* cm, bool is_last_partial_run);
+
   // Deregisters the partitions on the workers.  Called in the
   // destructor and does not wait for the rpc completion.
   void DeregisterPartitions();
@@ -411,7 +433,8 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     c->req.set_session_handle(session_handle_);
     c->req.mutable_graph_def()->Swap(&graph_partitions[part.name]);
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
-    *c->req.mutable_debug_options() = debug_opts_;
+    *c->req.mutable_debug_options() =
+        callable_opts_.run_options().debug_options();
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -490,24 +513,46 @@ class RunManyGraphs {
   TF_DISALLOW_COPY_AND_ASSIGN(RunManyGraphs);
 };
 
-Status MasterSession::ReffedClientGraph::RunPartitions(
-    const MasterEnv* env, int64 step_id, int64 execution_count,
-    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
-    MutableRunStepResponseWrapper* resp, CancellationManager* cm,
-    const bool is_last_partial_run) {
-  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
-          << execution_count;
-  // Maps the names of fed tensors to their index in `req`.
-  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+namespace {
+Status AddSendFromClientRequest(const RunStepRequestWrapper& client_req,
+                                MutableRunGraphRequestWrapper* worker_req,
+                                size_t index, const string& send_key) {
+  return worker_req->AddSendFromRunStepRequest(client_req, index, send_key);
+}
 
-  for (size_t i = 0; i < req.num_feeds(); ++i) {
-    if (!feeds.insert({req.feed_name(i), i}).second) {
-      return errors::InvalidArgument("Duplicated feeds: ", req.feed_name(i));
-    }
-  }
+Status AddSendFromClientRequest(const RunCallableRequest& client_req,
+                                MutableRunGraphRequestWrapper* worker_req,
+                                size_t index, const string& send_key) {
+  return worker_req->AddSendFromRunCallableRequest(client_req, index, send_key);
+}
 
-  // Prepares a number of calls to workers. One call per partition.
+// TODO(mrry): Add a full-fledged wrapper that avoids TensorProto copies for
+// in-process messages.
+struct RunCallableResponseWrapper {
+  RunCallableResponse* resp;  // Not owned.
+  std::unordered_map<string, TensorProto> fetch_key_to_protos;
+
+  RunMetadata* mutable_metadata() { return resp->mutable_metadata(); }
 
+  Status AddTensorFromRunGraphResponse(
+      const string& tensor_name, MutableRunGraphResponseWrapper* worker_resp,
+      size_t index) {
+    // TODO(b/74355905): Add a specialized implementation that avoids
+    // copying the tensor into the RunCallableResponse when at least
+    // two of the {client, master, worker} are in the same process.
+    return worker_resp->RecvValue(index, &fetch_key_to_protos[tensor_name]);
+  }
+};
+}  // namespace
+
+template <class FetchListType, class ClientRequestType,
+          class ClientResponseType>
+Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
+    const std::unordered_map<StringPiece, size_t, StringPieceHasher>& feeds,
+    const FetchListType& fetches, const MasterEnv* env, int64 step_id,
+    int64 execution_count, PerStepState* pss, CallOptions* call_opts,
+    const ClientRequestType& req, ClientResponseType* resp,
+    CancellationManager* cm, bool is_last_partial_run) {
   // Collect execution cost stats on a smoothly decreasing frequency.
   ExecutorOpts exec_opts;
   if (pss->report_tensor_allocations_upon_oom) {
@@ -553,28 +598,19 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
     // We keep these as separate paths for now, to ensure we aren't
     // inadvertently slowing down the normal run path.
     if (is_partial_) {
-      for (size_t i = 0; i < req.num_feeds(); ++i) {
-        const string& name = req.feed_name(i);
-        const auto iter = part.feed_key.find(name);
+      for (const auto& name_index : feeds) {
+        const auto iter = part.feed_key.find(name_index.first.ToString());
         if (iter == part.feed_key.end()) {
           // The provided feed must be for a different partition.
           continue;
         }
         const string& key = iter->second;
-        auto feeds_iter = feeds.find(name);
-        if (feeds_iter == feeds.end()) {
-          return errors::InvalidArgument("No feed is provided for feed=", name,
-                                         ", key=", key);
-        } else if (feeds_iter->second != static_cast<size_t>(i)) {
-          return errors::Internal("Cannot find feed named \"", name,
-                                  " in request.");
-        }
-        TF_RETURN_IF_ERROR(c->req->AddSendFromRunStepRequest(req, i, key));
+        TF_RETURN_IF_ERROR(AddSendFromClientRequest(req, c->req.get(),
+                                                    name_index.second, key));
       }
       // TODO(suharshs): Make a map from feed to fetch_key to make this faster.
       // For now, we just iterate through partitions to find the matching key.
-      for (int i = 0; static_cast<size_t>(i) < req.num_fetches(); ++i) {
-        const string& req_fetch = req.fetch_name(i);
+      for (const string& req_fetch : fetches) {
         for (const auto& key_fetch : part.key_fetch) {
           if (key_fetch.second == req_fetch) {
             c->req->add_recv_key(key_fetch.first);
@@ -586,9 +622,13 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
       for (const auto& feed_key : part.feed_key) {
         const string& feed = feed_key.first;
         const string& key = feed_key.second;
-        const int64 feed_index = feeds[feed];
+        auto iter = feeds.find(feed);
+        if (iter == feeds.end()) {
+          return errors::Internal("No feed index found for feed: ", feed);
+        }
+        const int64 feed_index = iter->second;
         TF_RETURN_IF_ERROR(
-            c->req->AddSendFromRunStepRequest(req, feed_index, key));
+            AddSendFromClientRequest(req, c->req.get(), feed_index, key));
       }
       for (const auto& key_fetch : part.key_fetch) {
         const string& key = key_fetch.first;
@@ -622,50 +662,115 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
   } else {
     return errors::Cancelled("Step was cancelled");
   }
+  TF_RETURN_IF_ERROR(calls.status());
 
-  // Collects fetches.
-  Status status = calls.status();
-  if (status.ok()) {
-    for (int i = 0; i < num; ++i) {
-      const Part& part = partitions_[i];
-      MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
-      for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
-        auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
-        if (iter == part.key_fetch.end()) {
-          status.Update(errors::Internal("Unexpected fetch key: ",
-                                         run_graph_resp->recv_key(j)));
-          break;
-        }
-        const string& fetch = iter->second;
-        status.Update(
-            resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
-        if (!status.ok()) {
-          break;
-        }
+  // Collects fetches and metadata.
+  Status status;
+  for (int i = 0; i < num; ++i) {
+    const Part& part = partitions_[i];
+    MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
+    for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
+      auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
+      if (iter == part.key_fetch.end()) {
+        status.Update(errors::Internal("Unexpected fetch key: ",
+                                       run_graph_resp->recv_key(j)));
+        break;
       }
-      if (pss->collect_timeline) {
-        pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
+      const string& fetch = iter->second;
+      status.Update(
+          resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
+      if (!status.ok()) {
+        break;
       }
-      if (pss->collect_costs) {
-        CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
-        for (int j = 0; j < cost_graph->node_size(); ++j) {
-          resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
-              cost_graph->mutable_node(j));
-        }
+    }
+    if (pss->collect_timeline) {
+      pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
+    }
+    if (pss->collect_costs) {
+      CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
+      for (int j = 0; j < cost_graph->node_size(); ++j) {
+        resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
+            cost_graph->mutable_node(j));
       }
-      if (pss->collect_partition_graphs) {
-        protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
-            resp->mutable_metadata()->mutable_partition_graphs();
-        for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
-          partition_graph_defs->Add()->Swap(
-              run_graph_resp->mutable_partition_graph(i));
-        }
+    }
+    if (pss->collect_partition_graphs) {
+      protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
+          resp->mutable_metadata()->mutable_partition_graphs();
+      for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
+        partition_graph_defs->Add()->Swap(
+            run_graph_resp->mutable_partition_graph(i));
       }
     }
   }
   return status;
 }
 
+Status MasterSession::ReffedClientGraph::RunPartitions(
+    const MasterEnv* env, int64 step_id, int64 execution_count,
+    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
+    MutableRunStepResponseWrapper* resp, CancellationManager* cm,
+    const bool is_last_partial_run) {
+  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
+          << execution_count;
+  // Maps the names of fed tensors to their index in `req`.
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+  for (size_t i = 0; i < req.num_feeds(); ++i) {
+    if (!feeds.insert({req.feed_name(i), i}).second) {
+      return errors::InvalidArgument("Duplicated feeds: ", req.feed_name(i));
+    }
+  }
+
+  std::vector<string> fetches;
+  fetches.reserve(req.num_fetches());
+  for (size_t i = 0; i < req.num_fetches(); ++i) {
+    fetches.push_back(req.fetch_name(i));
+  }
+
+  return RunPartitionsHelper(feeds, fetches, env, step_id, execution_count, pss,
+                             call_opts, req, resp, cm, is_last_partial_run);
+}
+
+Status MasterSession::ReffedClientGraph::RunPartitions(
+    const MasterEnv* env, int64 step_id, int64 execution_count,
+    PerStepState* pss, CallOptions* call_opts, const RunCallableRequest& req,
+    RunCallableResponse* resp, CancellationManager* cm) {
+  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
+          << execution_count;
+  // Maps the names of fed tensors to their index in `req`.
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+  for (size_t i = 0; i < callable_opts_.feed_size(); ++i) {
+    if (!feeds.insert({callable_opts_.feed(i), i}).second) {
+      // MakeCallable will fail if there are two feeds with the same name.
+      return errors::Internal("Duplicated feeds in callable: ",
+                              callable_opts_.feed(i));
+    }
+  }
+
+  // Create a wrapped response object to collect the fetched values and
+  // rearrange them for the RunCallableResponse.
+  RunCallableResponseWrapper wrapped_resp;
+  wrapped_resp.resp = resp;
+
+  TF_RETURN_IF_ERROR(RunPartitionsHelper(
+      feeds, callable_opts_.fetch(), env, step_id, execution_count, pss,
+      call_opts, req, &wrapped_resp, cm, false /* is_last_partial_run */));
+
+  // Collects fetches.
+  // TODO(b/74355905): Add a specialized implementation that avoids
+  // copying the tensor into the RunCallableResponse when at least
+  // two of the {client, master, worker} are in the same process.
+  for (const string& fetch : callable_opts_.fetch()) {
+    TensorProto* fetch_proto = resp->mutable_fetch()->Add();
+    auto iter = wrapped_resp.fetch_key_to_protos.find(fetch);
+    if (iter == wrapped_resp.fetch_key_to_protos.end()) {
+      return errors::Internal("Worker did not return a value for fetch: ",
+                              fetch);
+    }
+    fetch_proto->Swap(&iter->second);
+  }
+  return Status::OK();
+}
+
 namespace {
 
 class CleanupBroadcastHelper {
@@ -1266,15 +1371,11 @@ WorkerCacheInterface* MasterSession::get_worker_cache() const {
   return env_->worker_cache;
 }
 
-Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
-                                ReffedClientGraph** rcg, bool is_partial) {
+Status MasterSession::StartStep(const BuildGraphOptions& opts, bool is_partial,
+                                ReffedClientGraph** out_rcg, int64* out_count) {
   const uint64 hash = HashBuildGraphOptions(opts);
   {
     mutex_lock l(mu_);
-    // Keep track of how many times this subgraph has been executed in
-    // this session.
-    int64* c = &subgraph_execution_counts_[hash];
-    *count = (*c)++;
     // TODO(suharshs): We cache partial run graphs and run graphs separately
     // because there is preprocessing that needs to only be run for partial
     // run calls.
@@ -1296,8 +1397,9 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
-    *rcg = iter->second;
-    (*rcg)->Ref();
+    *out_rcg = iter->second;
+    (*out_rcg)->Ref();
+    *out_count = (*out_rcg)->get_and_increment_execution_count();
   }
   return Status::OK();
 }
@@ -1316,6 +1418,12 @@ void MasterSession::ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
   rcg_map->clear();
 }
 
+namespace {
+uint64 MakeStepId() {
+  return (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+}
+}  // namespace
+
 Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
                                       PartialRunSetupResponse* resp) {
   std::vector<string> inputs, outputs, targets;
@@ -1332,15 +1440,15 @@ Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
   string handle = std::to_string(partial_run_handle_counter_.fetch_add(1));
 
   ReffedClientGraph* rcg = nullptr;
-  int64 count = 0;
 
   // Prepare.
   BuildGraphOptions opts;
   BuildBuildGraphOptions(*req, &opts);
-  TF_RETURN_IF_ERROR(StartStep(opts, &count, &rcg, true));
+  int64 count;
+  TF_RETURN_IF_ERROR(StartStep(opts, true, &rcg, &count));
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+  const uint64 step_id = MakeStepId();
   TRACEPRINTF("stepid %llu", step_id);
 
   rcg->Ref();
@@ -1585,6 +1693,73 @@ Status MasterSession::CreateDebuggerState(
   return Status::OK();
 }
 
+void MasterSession::FillPerStepState(MasterSession::ReffedClientGraph* rcg,
+                                     const RunOptions& run_options,
+                                     uint64 step_id, int64 count,
+                                     PerStepState* out_pss,
+                                     std::unique_ptr<ProfileHandler>* out_ph) {
+  out_pss->collect_timeline =
+      run_options.trace_level() == RunOptions::FULL_TRACE;
+  out_pss->collect_rpcs = run_options.trace_level() == RunOptions::FULL_TRACE;
+  out_pss->report_tensor_allocations_upon_oom =
+      run_options.report_tensor_allocations_upon_oom();
+  // Build the cost model every 'build_cost_model_every' steps after skipping an
+  // initial 'build_cost_model_after' steps.
+  const int64 build_cost_model_after =
+      session_opts_.config.graph_options().build_cost_model_after();
+  const int64 build_cost_model_every =
+      session_opts_.config.graph_options().build_cost_model();
+  out_pss->collect_costs =
+      build_cost_model_every > 0 &&
+      ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
+  out_pss->collect_partition_graphs = run_options.output_partition_graphs();
+
+  *out_ph = rcg->GetProfileHandler(step_id, count, run_options);
+  if (*out_ph) {
+    out_pss->collect_timeline = true;
+    out_pss->collect_rpcs = (*out_ph)->should_collect_rpcs();
+  }
+}
+
+Status MasterSession::PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
+                                     uint64 step_id,
+                                     const RunOptions& run_options,
+                                     PerStepState* pss,
+                                     const std::unique_ptr<ProfileHandler>& ph,
+                                     const Status& run_status,
+                                     RunMetadata* out_run_metadata) {
+  Status s = run_status;
+  if (s.ok()) {
+    pss->end_micros = Env::Default()->NowMicros();
+
+    // Schedule post-processing and cleanup to be done asynchronously.
+    rcg->ProcessStats(step_id, pss, ph.get(), run_options, out_run_metadata);
+  } else if (errors::IsCancelled(s)) {
+    mutex_lock l(mu_);
+    if (closed_) {
+      if (garbage_collected_) {
+        s = errors::Cancelled(
+            "Step was cancelled because the session was garbage collected due "
+            "to inactivity.");
+      } else {
+        s = errors::Cancelled(
+            "Step was cancelled by an explicit call to `Session::Close()`.");
+      }
+    }
+  }
+  Ref();
+  rcg->Ref();
+  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
+    if (!s.ok()) {
+      LOG(ERROR) << "Cleanup partition error: " << s;
+    }
+    rcg->Unref();
+    MarkRunCompletion();
+    Unref();
+  });
+  return s;
+}
+
 Status MasterSession::DoRunWithLocalExecution(
     CallOptions* opts, const RunStepRequestWrapper& req,
     MutableRunStepResponseWrapper* resp) {
@@ -1597,8 +1772,8 @@ Status MasterSession::DoRunWithLocalExecution(
   BuildGraphOptions bgopts;
   BuildBuildGraphOptions(req, &bgopts);
   ReffedClientGraph* rcg = nullptr;
-  int64 count = 0;
-  TF_RETURN_IF_ERROR(StartStep(bgopts, &count, &rcg, false));
+  int64 count;
+  TF_RETURN_IF_ERROR(StartStep(bgopts, false, &rcg, &count));
 
   // Unref "rcg" when out of scope.
   core::ScopedUnref unref(rcg);
@@ -1614,64 +1789,133 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+  const uint64 step_id = MakeStepId();
   TRACEPRINTF("stepid %llu", step_id);
 
-  pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
-  pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
-  pss.report_tensor_allocations_upon_oom =
-      req.options().report_tensor_allocations_upon_oom();
-  // Build the cost model every 'build_cost_model_every' steps after skipping an
-  // initial 'build_cost_model_after' steps.
-  const int64 build_cost_model_after =
-      session_opts_.config.graph_options().build_cost_model_after();
-  const int64 build_cost_model_every =
-      session_opts_.config.graph_options().build_cost_model();
-  pss.collect_costs =
-      build_cost_model_every > 0 &&
-      ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
-  pss.collect_partition_graphs = req.options().output_partition_graphs();
+  std::unique_ptr<ProfileHandler> ph;
+  FillPerStepState(rcg, req.options(), step_id, count, &pss, &ph);
 
-  std::unique_ptr<ProfileHandler> ph =
-      rcg->GetProfileHandler(step_id, count, req.options());
-  if (ph) {
-    pss.collect_timeline = true;
-    pss.collect_rpcs = ph->should_collect_rpcs();
+  Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
+                                &cancellation_manager_, false);
+  cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
+  return PostRunCleanup(rcg, step_id, req.options(), &pss, ph, s,
+                        resp->mutable_metadata());
+}
+
+Status MasterSession::MakeCallable(const MakeCallableRequest& req,
+                                   MakeCallableResponse* resp) {
+  UpdateLastAccessTime();
+
+  BuildGraphOptions opts;
+  opts.callable_options = req.options();
+  opts.use_function_convention = false;
+
+  ReffedClientGraph* callable;
+
+  {
+    mutex_lock l(mu_);
+    if (closed_) {
+      return errors::FailedPrecondition("Session is closed.");
+    }
+    std::unique_ptr<ClientGraph> client_graph;
+    TF_RETURN_IF_ERROR(execution_state_->BuildGraph(opts, &client_graph));
+    callable = new ReffedClientGraph(handle_, opts, std::move(client_graph),
+                                     session_opts_, stats_publisher_factory_,
+                                     false /* is_partial */, get_worker_cache(),
+                                     !should_delete_worker_sessions_);
+  }
+
+  Status s = BuildAndRegisterPartitions(callable);
+  if (!s.ok()) {
+    callable->Unref();
+    return s;
   }
 
+  uint64 handle;
+  {
+    mutex_lock l(mu_);
+    handle = next_callable_handle_++;
+    callables_[handle] = callable;
+  }
+
+  resp->set_handle(handle);
+  return Status::OK();
+}
+
+Status MasterSession::DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
+                                    const RunCallableRequest& req,
+                                    RunCallableResponse* resp) {
+  VLOG(2) << "DoRunCallable req: " << req.DebugString();
+  PerStepState pss;
+  pss.start_micros = Env::Default()->NowMicros();
+  auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
+
+  // Prepare.
+  int64 count = rcg->get_and_increment_execution_count();
+
+  // Keeps the highest 8 bits 0x01: we reserve some bits of the
+  // step_id for future use.
+  const uint64 step_id = MakeStepId();
+  TRACEPRINTF("stepid %llu", step_id);
+
+  const RunOptions& run_options = rcg->callable_options().run_options();
+
+  if (run_options.timeout_in_ms() != 0) {
+    opts->SetTimeout(run_options.timeout_in_ms());
+  }
+
+  std::unique_ptr<ProfileHandler> ph;
+  FillPerStepState(rcg, run_options, step_id, count, &pss, &ph);
   Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
-                                &cancellation_manager_, false);
-  if (s.ok()) {
-    pss.end_micros = Env::Default()->NowMicros();
+                                &cancellation_manager_);
+  cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
+  return PostRunCleanup(rcg, step_id, run_options, &pss, ph, s,
+                        resp->mutable_metadata());
+}
 
-    // Schedule post-processing and cleanup to be done asynchronously.
-    rcg->ProcessStats(step_id, &pss, ph.get(), req.options(),
-                      resp->mutable_metadata());
-  } else if (errors::IsCancelled(s)) {
+Status MasterSession::RunCallable(CallOptions* opts,
+                                  const RunCallableRequest& req,
+                                  RunCallableResponse* resp) {
+  UpdateLastAccessTime();
+  ReffedClientGraph* callable;
+  {
     mutex_lock l(mu_);
     if (closed_) {
-      if (garbage_collected_) {
-        s = errors::Cancelled(
-            "Step was cancelled because the session was garbage collected due "
-            "to inactivity.");
-      } else {
-        s = errors::Cancelled(
-            "Step was cancelled by an explicit call to `Session::Close()`.");
-      }
+      return errors::FailedPrecondition("Session is closed.");
+    }
+    int64 handle = req.handle();
+    if (handle >= next_callable_handle_) {
+      return errors::InvalidArgument("No such callable handle: ", handle);
+    }
+    auto iter = callables_.find(req.handle());
+    if (iter == callables_.end()) {
+      return errors::InvalidArgument(
+          "Attempted to run callable after handle was released: ", handle);
     }
+    callable = iter->second;
+    callable->Ref();
+    ++num_running_;
   }
-  Ref();
-  rcg->Ref();
-  cleanup.release();  // MarkRunCompletion called in done closure.
-  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
-    if (!s.ok()) {
-      LOG(ERROR) << "Cleanup partition error: " << s;
+  core::ScopedUnref unref_callable(callable);
+  return DoRunCallable(opts, callable, req, resp);
+}
+
+Status MasterSession::ReleaseCallable(const ReleaseCallableRequest& req,
+                                      ReleaseCallableResponse* resp) {
+  UpdateLastAccessTime();
+  ReffedClientGraph* to_unref = nullptr;
+  {
+    mutex_lock l(mu_);
+    auto iter = callables_.find(req.handle());
+    if (iter != callables_.end()) {
+      to_unref = iter->second;
+      callables_.erase(iter);
     }
-    rcg->Unref();
-    MarkRunCompletion();
-    Unref();
-  });
-  return s;
+  }
+  if (to_unref != nullptr) {
+    to_unref->Unref();
+  }
+  return Status::OK();
 }
 
 Status MasterSession::Close() {
@@ -1688,6 +1932,7 @@ Status MasterSession::Close() {
     }
     ClearRunsTable(&to_unref, &run_graphs_);
     ClearRunsTable(&to_unref, &partial_run_graphs_);
+    ClearRunsTable(&to_unref, &callables_);
   }
   for (ReffedClientGraph* rcg : to_unref) rcg->Unref();
   if (should_delete_worker_sessions_) {
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index 4bd4e1367aa75730df829a2909005a221b9ab780..a05419904f54054ee439bfa5578bb4abbe0bd45e 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -89,6 +89,15 @@ class MasterSession : public core::RefCounted {
 
   Status ListDevices(ListDevicesResponse* resp) const;
 
+  Status MakeCallable(const MakeCallableRequest& req,
+                      MakeCallableResponse* resp);
+
+  Status RunCallable(CallOptions* opts, const RunCallableRequest& req,
+                     RunCallableResponse* resp);
+
+  Status ReleaseCallable(const ReleaseCallableRequest& req,
+                         ReleaseCallableResponse* resp);
+
   // Close this session and delete "*this". Returns OK if all known
   // states are cleanup successfully.
   //
@@ -140,6 +149,8 @@ class MasterSession : public core::RefCounted {
   typedef std::unordered_map<uint64, ReffedClientGraph*> RCGMap;
   RCGMap run_graphs_ GUARDED_BY(mu_);
   RCGMap partial_run_graphs_ GUARDED_BY(mu_);
+  int64 next_callable_handle_ GUARDED_BY(mu_) = 0;
+  RCGMap callables_ GUARDED_BY(mu_);
 
   struct PerStepState {
     bool collect_costs = false;
@@ -205,15 +216,28 @@ class MasterSession : public core::RefCounted {
   bool should_delete_worker_sessions_ = false;
   Status DeleteWorkerSessions();
 
-  Status StartStep(const BuildGraphOptions& opts, int64* count,
-                   ReffedClientGraph** graph, bool is_partial);
+  Status StartStep(const BuildGraphOptions& opts, bool is_partial,
+                   ReffedClientGraph** out_rcg, int64* out_count);
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
                       RCGMap* rcg_map) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  void FillPerStepState(MasterSession::ReffedClientGraph* rcg,
+                        const RunOptions& run_options, uint64 step_id,
+                        int64 count, PerStepState* out_pss,
+                        std::unique_ptr<ProfileHandler>* out_ph);
   Status DoRunWithLocalExecution(CallOptions* opts,
                                  const RunStepRequestWrapper& req,
                                  MutableRunStepResponseWrapper* resp);
   Status DoPartialRun(CallOptions* opts, const RunStepRequestWrapper& req,
                       MutableRunStepResponseWrapper* resp);
+  Status DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
+                       const RunCallableRequest& req,
+                       RunCallableResponse* resp);
+  Status PostRunCleanup(MasterSession::ReffedClientGraph* rcg, uint64 step_id,
+                        const RunOptions& run_options, PerStepState* pss,
+                        const std::unique_ptr<ProfileHandler>& ph,
+                        const Status& run_status,
+                        RunMetadata* out_run_metadata);
+
   void MarkRunCompletion();
   void UpdateLastAccessTime();
 
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 66ebb3080af7cd01021362b5ea0c0b54458aebfc..18668b44d3c6da13b4e8db717da008accb311cbc 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -326,6 +326,20 @@ Status InMemoryRunGraphRequest::AddSendFromRunStepRequest(
   return Status::OK();
 }
 
+// TODO(b/74355905): Add a specialized implementation that avoids
+// copying the tensor when at least two of the {client, master,
+// worker} are in the same process.
+Status InMemoryRunGraphRequest::AddSendFromRunCallableRequest(
+    const RunCallableRequest& run_callable_request, size_t i,
+    const string& send_key) {
+  Tensor tensor;
+  if (!ParseTensorProtoToTensor(run_callable_request.feed(i), &tensor)) {
+    return errors::InvalidArgument("Invalid TensorProto for feed value ", i);
+  }
+  sends_.emplace_back(send_key, std::move(tensor));
+  return Status::OK();
+}
+
 size_t InMemoryRunGraphRequest::num_recvs() const { return recvs_.size(); }
 
 const string& InMemoryRunGraphRequest::recv_key(size_t i) const {
@@ -439,6 +453,18 @@ Status MutableProtoRunGraphRequest::AddSendFromRunStepRequest(
   return Status::OK();
 }
 
+// TODO(b/74355905): Add a specialized implementation that avoids
+// copying the tensor when at least two of the {client, master,
+// worker} are in the same process.
+Status MutableProtoRunGraphRequest::AddSendFromRunCallableRequest(
+    const RunCallableRequest& run_callable_request, size_t i,
+    const string& send_key) {
+  NamedTensorProto* send = request_.add_send();
+  send->set_name(send_key);
+  *send->mutable_tensor() = run_callable_request.feed(i);
+  return Status::OK();
+}
+
 size_t MutableProtoRunGraphRequest::num_recvs() const {
   return request_.recv_key_size();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 79fa6f926ea6afb351eacf279d3cf493b6d4713f..1f7cdb98a41ec017db1bd7fa0b7c6f9bb2299021 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -302,6 +302,9 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
   virtual Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) = 0;
+  virtual Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) = 0;
 
   virtual void add_recv_key(const string& recv_key) = 0;
   virtual void set_is_partial(bool is_partial) = 0;
@@ -334,6 +337,9 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) override;
+  Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) override;
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
@@ -385,6 +391,9 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
   Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) override;
+  Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) override;
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 9c655bfa312488e2bb435ea7c10a3cede2ab3bf2..fa0f8c9b5250b100f1c261b99b4466a25467327e 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -189,7 +189,6 @@ cc_library(
     srcs = ["grpc_worker_service_impl.cc"],
     hdrs = ["grpc_worker_service_impl.h"],
     deps = [
-        ":grpc_serialization_traits",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "@grpc//:grpc++_unsecure",
@@ -235,22 +234,11 @@ cc_library(
     srcs = ["grpc_master_service_impl.cc"],
     hdrs = ["grpc_master_service_impl.h"],
     deps = [
-        ":grpc_serialization_traits",
         "//tensorflow/core:master_proto_cc",
         "@grpc//:grpc++_unsecure",
     ],
 )
 
-cc_library(
-    name = "grpc_serialization_traits",
-    srcs = [],
-    hdrs = ["grpc_serialization_traits.h"],
-    deps = [
-        "@grpc//:grpc++_unsecure",
-        "@grpc//:grpc_unsecure",
-    ],
-)
-
 cc_library(
     name = "rpc_rendezvous_mgr",
     srcs = ["rpc_rendezvous_mgr.cc"],
@@ -499,3 +487,33 @@ tf_cuda_cc_test(
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
+
+cc_library(
+    name = "grpc_rpc_factory",
+    srcs = [
+        "grpc_rpc_factory.cc",
+    ],
+    hdrs = ["grpc_rpc_factory.h"],
+    deps = [
+        ":grpc_state",
+        ":grpc_util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/util/rpc:call_container",
+        "//tensorflow/core/util/rpc:rpc_factory",
+    ],
+)
+
+cc_library(
+    name = "grpc_rpc_factory_registration",
+    srcs = [
+        "grpc_rpc_factory_registration.cc",
+    ],
+    deps = [
+        ":grpc_rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory_registry",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index 63745e8ebd835c6740268e2eb2fc50a1100210f8..23968e24c87ee1d12cab973b6d9568f1af706a0e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -111,6 +111,11 @@ class GrpcMasterService : public AsyncServiceInterface {
     ENQUEUE_REQUEST(CloseSession, false);
     ENQUEUE_REQUEST(ListDevices, false);
     ENQUEUE_REQUEST(Reset, false);
+    ENQUEUE_REQUEST(MakeCallable, false);
+    for (int i = 0; i < 100; ++i) {
+      ENQUEUE_REQUEST(RunCallable, true);
+    }
+    ENQUEUE_REQUEST(ReleaseCallable, false);
 
     void* tag;
     bool ok;
@@ -236,6 +241,47 @@ class GrpcMasterService : public AsyncServiceInterface {
                         });
     ENQUEUE_REQUEST(Reset, false);
   }
+
+  // RPC handler for making a callable.
+  void MakeCallableHandler(
+      MasterCall<MakeCallableRequest, MakeCallableResponse>* call) {
+    master_impl_->MakeCallable(&call->request, &call->response,
+                               [call](const Status& status) {
+                                 call->SendResponse(ToGrpcStatus(status));
+                               });
+    ENQUEUE_REQUEST(MakeCallable, false);
+  }
+
+  // RPC handler for running a callable.
+  void RunCallableHandler(
+      MasterCall<RunCallableRequest, RunCallableResponse>* call) {
+    auto* trace = TraceRpc("RunCallable/Server", call->client_metadata());
+    CallOptions* call_opts = new CallOptions;
+    // The timeout may be overridden by a non-zero timeout in the
+    // callable's `RunOptions`; this overriding will happen inside the
+    // `MasterSession` implementation.
+    call_opts->SetTimeout(default_session_config_.operation_timeout_in_ms());
+    call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+    master_impl_->RunCallable(call_opts, &call->request, &call->response,
+                              [call, call_opts, trace](const Status& status) {
+                                call->ClearCancelCallback();
+                                delete call_opts;
+                                delete trace;
+                                call->SendResponse(ToGrpcStatus(status));
+                              });
+    ENQUEUE_REQUEST(RunCallable, false);
+  }
+
+  // RPC handler for making a callable.
+  void ReleaseCallableHandler(
+      MasterCall<ReleaseCallableRequest, ReleaseCallableResponse>* call) {
+    master_impl_->ReleaseCallable(&call->request, &call->response,
+                                  [call](const Status& status) {
+                                    call->SendResponse(ToGrpcStatus(status));
+                                  });
+    ENQUEUE_REQUEST(ReleaseCallable, false);
+  }
+
 #undef ENQUEUE_REQUEST
 
   // Start tracing, including the ID attached to the RPC.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index e2016e824c0bf504af4c624cad253963b223eb35..c832adbbbf8eba1ec512d62470025fb56a39b8a4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -36,6 +36,9 @@ static const char* grpcMasterService_method_names[] = {
     "/tensorflow.MasterService/CloseSession",
     "/tensorflow.MasterService/ListDevices",
     "/tensorflow.MasterService/Reset",
+    "/tensorflow.MasterService/MakeCallable",
+    "/tensorflow.MasterService/RunCallable",
+    "/tensorflow.MasterService/ReleaseCallable",
 };
 
 std::unique_ptr<MasterService::Stub> MasterService::NewStub(
@@ -64,7 +67,14 @@ MasterService::Stub::Stub(
       rpcmethod_ListDevices_(grpcMasterService_method_names[5],
                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_Reset_(grpcMasterService_method_names[6],
-                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel) {}
+                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_MakeCallable_(grpcMasterService_method_names[7],
+                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_RunCallable_(grpcMasterService_method_names[8],
+                             ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_ReleaseCallable_(grpcMasterService_method_names[9],
+                                 ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                 channel) {}
 
 ::grpc::Status MasterService::Stub::CreateSession(
     ::grpc::ClientContext* context, const CreateSessionRequest& request,
@@ -115,8 +125,29 @@ MasterService::Stub::Stub(
                                              context, request, response);
 }
 
+::grpc::Status MasterService::Stub::MakeCallable(
+    ::grpc::ClientContext* context, const MakeCallableRequest& request,
+    MakeCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_MakeCallable_, context, request, response);
+}
+
+::grpc::Status MasterService::Stub::RunCallable(
+    ::grpc::ClientContext* context, const RunCallableRequest& request,
+    RunCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_RunCallable_, context, request, response);
+}
+
+::grpc::Status MasterService::Stub::ReleaseCallable(
+    ::grpc::ClientContext* context, const ReleaseCallableRequest& request,
+    ReleaseCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ReleaseCallable_, context, request, response);
+}
+
 MasterService::AsyncService::AsyncService() {
-  for (int i = 0; i < 7; ++i) {
+  for (int i = 0; i < 10; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcMasterService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 6ae94b74417c3fb6c4da1589bb9f532cb6d79930..8f1b589698276d5df7aa0245d57bc5bdb4a9f0db 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -25,18 +25,8 @@ limitations under the License.
 #include "grpc++/impl/codegen/stub_options.h"
 #include "grpc++/impl/codegen/sync_stream.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::CreateSessionRequest);
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::ExtendSessionRequest);
-// Contains potentially large TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunStepRequest);
-// Contains potentially large StepStats, TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunStepResponse);
-
 namespace grpc {
 class CompletionQueue;
 class Channel;
@@ -79,6 +69,15 @@ class MasterService final {
     virtual ::grpc::Status Reset(::grpc::ClientContext* context,
                                  const ResetRequest& request,
                                  ResetResponse* response) = 0;
+    virtual ::grpc::Status MakeCallable(::grpc::ClientContext* context,
+                                        const MakeCallableRequest& request,
+                                        MakeCallableResponse* response) = 0;
+    virtual ::grpc::Status RunCallable(::grpc::ClientContext* context,
+                                       const RunCallableRequest& request,
+                                       RunCallableResponse* response) = 0;
+    virtual ::grpc::Status ReleaseCallable(
+        ::grpc::ClientContext* context, const ReleaseCallableRequest& request,
+        ReleaseCallableResponse* response) = 0;
   };
   class Stub final : public StubInterface {
    public:
@@ -104,6 +103,15 @@ class MasterService final {
     ::grpc::Status Reset(::grpc::ClientContext* context,
                          const ResetRequest& request,
                          ResetResponse* response) override;
+    ::grpc::Status MakeCallable(::grpc::ClientContext* context,
+                                const MakeCallableRequest& request,
+                                MakeCallableResponse* response) override;
+    ::grpc::Status RunCallable(::grpc::ClientContext* context,
+                               const RunCallableRequest& request,
+                               RunCallableResponse* response) override;
+    ::grpc::Status ReleaseCallable(::grpc::ClientContext* context,
+                                   const ReleaseCallableRequest& request,
+                                   ReleaseCallableResponse* response) override;
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
@@ -114,6 +122,9 @@ class MasterService final {
     const ::grpc::internal::RpcMethod rpcmethod_CloseSession_;
     const ::grpc::internal::RpcMethod rpcmethod_ListDevices_;
     const ::grpc::internal::RpcMethod rpcmethod_Reset_;
+    const ::grpc::internal::RpcMethod rpcmethod_MakeCallable_;
+    const ::grpc::internal::RpcMethod rpcmethod_RunCallable_;
+    const ::grpc::internal::RpcMethod rpcmethod_ReleaseCallable_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
@@ -179,6 +190,30 @@ class MasterService final {
       ::grpc::Service::RequestAsyncUnary(6, context, request, response,
                                          new_call_cq, notification_cq, tag);
     }
+    void RequestMakeCallable(
+        ::grpc::ServerContext* context, MakeCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<MakeCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(7, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestRunCallable(
+        ::grpc::ServerContext* context, RunCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<RunCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(8, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestReleaseCallable(
+        ::grpc::ServerContext* context, ReleaseCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<ReleaseCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(9, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
   };
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index 1088e9be66ceb7fbddfaed0691423745f362343f..1b92a79a67eae25a1b5e2942006da1246ca24b4b 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -95,6 +95,28 @@ class GrpcRemoteMaster : public MasterInterface {
                 &MasterServiceStub::Reset);
   }
 
+  Status MakeCallable(CallOptions* call_options,
+                      const MakeCallableRequest* request,
+                      MakeCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::MakeCallable);
+  }
+  Status RunCallable(CallOptions* call_options,
+                     const RunCallableRequest* request,
+                     RunCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::RunCallable);
+  }
+  Status ReleaseCallable(CallOptions* call_options,
+                         const ReleaseCallableRequest* request,
+                         ReleaseCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::ReleaseCallable);
+  }
+
  private:
   // Start tracing, attaching a unique ID to both the trace and the RPC.
   port::Tracing::TraceMe TraceRpc(StringPiece name,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d004abd1c189f72ed25c8e62e66a798d480fee78
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -0,0 +1,213 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h"
+
+namespace tensorflow {
+
+namespace {
+class GrpcCall {
+ public:
+  explicit GrpcCall(CallContainer<GrpcCall>* container, int index, bool try_rpc,
+                    const string* request_msg, string* response_msg,
+                    int32* status_code, string* status_message)
+      : container_(container),
+        index_(index),
+        try_rpc_(try_rpc),
+        request_msg_(request_msg),
+        response_msg_(response_msg),
+        status_code_(status_code),
+        status_message_(status_message) {}
+
+  void StartCancel() { call_opts_.StartCancel(); }
+
+  void Done(const Status& s) {
+    DCHECK(container_ != nullptr);
+    if (!s.ok() && try_rpc_) {
+      DCHECK(status_code_ != nullptr);
+      DCHECK(status_message_ != nullptr);
+      *status_code_ = s.code();
+      *status_message_ = s.error_message();
+    }
+    container_->Done(s, index_);
+  }
+
+  const string& request() const { return *request_msg_; }
+  string* response() const { return response_msg_; }
+  CallOptions* call_opts() { return &call_opts_; }
+
+ private:
+  CallContainer<GrpcCall>* const container_;
+  const int index_;
+  bool try_rpc_;
+  CallOptions call_opts_;
+  const string* request_msg_;
+  string* response_msg_;
+  int* status_code_;
+  string* status_message_;
+};
+
+}  // namespace
+
+GrpcRPCFactory::GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
+                               int64 timeout_in_ms)
+    : RPCFactory(), fail_fast_(fail_fast), timeout_in_ms_(timeout_in_ms) {
+  // TODO(ebrevdo): Investigate possible performance improvements by
+  // replacing this thread with a threadpool.
+  polling_thread_ =
+      ctx->env()->StartThread(ThreadOptions(), "rpc_op_grpc_factory", [this]() {
+        void* tag;
+        bool ok;
+        while (completion_queue_.Next(&tag, &ok)) {
+          GrpcClientCQTag* callback_tag = static_cast<GrpcClientCQTag*>(tag);
+          callback_tag->OnCompleted(ok);
+        }
+      });
+}
+
+GrpcRPCFactory::~GrpcRPCFactory() {
+  // The amount of time we wait depends on several parameters, including:
+  //   - the value of the fail_fast attribute.
+  //   - the timeout option of the rpc call in the proto declaration.
+  //   - the network roundtrip time and service's execution time.
+  //
+  // If a connection is made but the service doesn't ever respond, and
+  // there is no timeout option set for this rpc call, then it is
+  // possible the RPC request will wait forever.
+  //
+  completion_queue_.Shutdown();
+  delete polling_thread_;
+}
+
+void GrpcRPCFactory::Call(OpKernelContext* ctx, int64 num_elements,
+                          const Tensor& address_t, const Tensor& method_t,
+                          const Tensor& request_t, const bool try_rpc,
+                          Tensor* response_t, Tensor* status_code_t,
+                          Tensor* status_message_t,
+                          AsyncOpKernel::DoneCallback done) {
+  auto address = address_t.flat<string>();
+  auto method = method_t.flat<string>();
+  auto request = request_t.flat<string>();
+
+  // Stubs are maintained by the GrpcRPCFactory class and will be
+  // deleted when the class is destroyed.
+  ::grpc::GenericStub* singleton_stub = nullptr;
+  if (address.size() == 1) {
+    singleton_stub = GetOrCreateStubForAddress(address(0));
+  }
+  auto get_stub = [&address, this,
+                   singleton_stub](int64 ix) -> ::grpc::GenericStub* {
+    return (address.size() > 1) ? GetOrCreateStubForAddress(address(ix))
+                                : singleton_stub;
+  };
+  auto get_method_ptr = [&method](int64 ix) -> const string* {
+    return (method.size() > 1) ? &(method(ix)) : &(method(0));
+  };
+  auto get_request_ptr = [&request](int64 ix) -> const string* {
+    return (request.size() > 1) ? &(request(ix)) : &(request(0));
+  };
+
+  if (try_rpc) {
+    // In this case status_code will never be set in the response,
+    // so we just set it to OK.
+    DCHECK(status_code_t != nullptr);
+    status_code_t->flat<int32>().setConstant(
+        static_cast<int>(errors::Code::OK));
+  }
+
+  CancellationManager* cm = ctx->cancellation_manager();
+  CancellationToken cancellation_token = cm->get_cancellation_token();
+
+  // This object will delete itself when done.
+  auto* container =
+      new CallContainer<GrpcCall>(ctx, num_elements, fail_fast_, try_rpc,
+                                  std::move(done), cancellation_token);
+
+  auto response = response_t->flat<string>();
+  int32* status_code_ptr = nullptr;
+  string* status_message_ptr = nullptr;
+  if (try_rpc) {
+    status_code_ptr = status_code_t->flat<int32>().data();
+    status_message_ptr = status_message_t->flat<string>().data();
+  }
+  for (int i = 0; i < num_elements; ++i) {
+    container->calls()->emplace_back(
+        container, i, try_rpc, get_request_ptr(i), &response(i),
+        (try_rpc) ? &status_code_ptr[i] : nullptr,
+        (try_rpc) ? &status_message_ptr[i] : nullptr);
+  }
+
+  int i = 0;
+  for (GrpcCall& call : *(container->calls())) {
+    // This object will delete itself when done.
+    new RPCState<string>(get_stub(i), &completion_queue_, *get_method_ptr(i),
+                         call.request(), call.response(),
+                         /*done=*/[&call](const Status& s) { call.Done(s); },
+                         call.call_opts(), fail_fast_, timeout_in_ms_);
+    ++i;
+  }
+
+  // Need to register this callback after all the RPCs are in
+  // flight; otherwise we may try to cancel an RPC *before* it
+  // launches, which is a no-op, and then fall into a deadlock.
+  bool is_cancelled = !cm->RegisterCallback(
+      cancellation_token, [container]() { container->StartCancel(); });
+
+  if (is_cancelled) {
+    ctx->SetStatus(errors::Cancelled("Operation has been cancelled."));
+    // container's reference counter will take care of calling done().
+    container->StartCancel();
+  }
+}
+
+::grpc::GenericStub* GrpcRPCFactory::GetOrCreateStubForAddress(
+    const string& address) {
+  mutex_lock lock(mu_);
+
+  auto stub = stubs_.find(address);
+  if (stub != stubs_.end()) return stub->second.get();
+
+  ChannelPtr channel = CreateChannelForAddress(address);
+  auto* created = new ::grpc::GenericStub(channel);
+  stubs_[address].reset(created);
+  return created;
+}
+
+GrpcRPCFactory::ChannelPtr GrpcRPCFactory::CreateChannelForAddress(
+    const string& address) {
+  ::grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+
+  // Set a standard backoff timeout of 1s instead of the
+  // (sometimes default) 20s.
+  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  return ::grpc::CreateCustomChannel(
+      /*target=*/address, ::grpc::InsecureChannelCredentials(), args);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..34ec235aafc1535f27f5943b48e8b8afdcee43ac
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+class GrpcRPCFactory : public RPCFactory {
+ public:
+  explicit GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
+                          int64 timeout_in_ms);
+
+  // Explicit destructor to control destruction order.
+  ~GrpcRPCFactory() override;
+
+  void Call(OpKernelContext* ctx, int64 num_elements, const Tensor& address_t,
+            const Tensor& method_t, const Tensor& request_t, const bool try_rpc,
+            Tensor* response_t, Tensor* status_code_t, Tensor* status_message_t,
+            AsyncOpKernel::DoneCallback done) override;
+
+ protected:
+  typedef std::shared_ptr<::grpc::Channel> ChannelPtr;
+  virtual ChannelPtr CreateChannelForAddress(const string& address);
+
+ private:
+  ::grpc::GenericStub* GetOrCreateStubForAddress(const string& address);
+
+  bool fail_fast_;
+  int64 timeout_in_ms_;
+  ::grpc::CompletionQueue completion_queue_;
+  Thread* polling_thread_;  // Owned.
+
+  mutex mu_;
+  typedef std::unique_ptr<::grpc::GenericStub> StubPtr;
+  std::unordered_map<string, StubPtr> stubs_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b884489378464d7271e31e0ae1d180134becc6dc
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+namespace {
+
+// Used for adding the grpc factory to the RPC factory registry.
+struct Value {
+  static RPCFactory* Function(OpKernelConstruction* ctx, bool fail_fast,
+                              int64 timeout_in_ms) {
+    return new GrpcRPCFactory(ctx, fail_fast, timeout_in_ms);
+  }
+};
+
+REGISTER_RPC_FACTORY("grpc", Value::Function);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
deleted file mode 100644
index e7f5fb0c6ae24caa3ffe5039d5daddb771c4858d..0000000000000000000000000000000000000000
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ /dev/null
@@ -1,217 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/support/slice.h"
-#include "grpc/grpc.h"
-
-namespace grpc {
-
-namespace tensorflow_helper {
-
-const int kGrpcBufferWriterMaxBufferLength = 8192;
-
-class GrpcBufferWriter final
-    : public ::grpc::protobuf::io::ZeroCopyOutputStream {
- public:
-  explicit GrpcBufferWriter(grpc_byte_buffer** bp, int block_size)
-      : block_size_(block_size), byte_count_(0), have_backup_(false) {
-    *bp = grpc_raw_byte_buffer_create(NULL, 0);
-    slice_buffer_ = &(*bp)->data.raw.slice_buffer;
-  }
-
-  ~GrpcBufferWriter() override {
-    if (have_backup_) {
-      grpc_slice_unref(backup_slice_);
-    }
-  }
-
-  bool Next(void** data, int* size) override {
-    if (have_backup_) {
-      slice_ = backup_slice_;
-      have_backup_ = false;
-    } else {
-      slice_ = grpc_slice_malloc(block_size_);
-    }
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    grpc_slice_buffer_add(slice_buffer_, slice_);
-    return true;
-  }
-
-  void BackUp(int count) override {
-    grpc_slice_buffer_pop(slice_buffer_);
-    if (count == block_size_) {
-      backup_slice_ = slice_;
-    } else {
-      backup_slice_ =
-          grpc_slice_split_tail(&slice_, GRPC_SLICE_LENGTH(slice_) - count);
-      grpc_slice_buffer_add(slice_buffer_, slice_);
-    }
-    // It's dangerous to keep an inlined grpc_slice as the backup slice, since
-    // on a following Next() call, a reference will be returned to this slice
-    // via GRPC_SLICE_START_PTR, which will not be an address held by
-    // slice_buffer_.
-    have_backup_ = backup_slice_.refcount != NULL;
-    byte_count_ -= count;
-  }
-
-  grpc::protobuf::int64 ByteCount() const override { return byte_count_; }
-
- private:
-  const int block_size_;
-  int64_t byte_count_;
-  grpc_slice_buffer* slice_buffer_;
-  bool have_backup_;
-  grpc_slice backup_slice_;
-  grpc_slice slice_;
-};
-
-class GrpcBufferReader final
-    : public ::grpc::protobuf::io::ZeroCopyInputStream {
- public:
-  explicit GrpcBufferReader(grpc_byte_buffer* buffer)
-      : byte_count_(0), backup_count_(0) {
-    (void)grpc_byte_buffer_reader_init(&reader_, buffer);
-  }
-  ~GrpcBufferReader() override { grpc_byte_buffer_reader_destroy(&reader_); }
-
-  bool Next(const void** data, int* size) override {
-    if (backup_count_ > 0) {
-      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
-              backup_count_;
-      GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
-      *size = (int)backup_count_;
-      backup_count_ = 0;
-      return true;
-    }
-    if (!grpc_byte_buffer_reader_next(&reader_, &slice_)) {
-      return false;
-    }
-    grpc_slice_unref(slice_);
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    return true;
-  }
-
-  void BackUp(int count) override { backup_count_ = count; }
-
-  bool Skip(int count) override {
-    const void* data;
-    int size;
-    while (Next(&data, &size)) {
-      if (size >= count) {
-        BackUp(size - count);
-        return true;
-      }
-      // size < count;
-      count -= size;
-    }
-    // error or we have too large count;
-    return false;
-  }
-
-  grpc::protobuf::int64 ByteCount() const override {
-    return byte_count_ - backup_count_;
-  }
-
- private:
-  int64_t byte_count_;
-  int64_t backup_count_;
-  grpc_byte_buffer_reader reader_;
-  grpc_slice slice_;
-};
-
-}  // namespace tensorflow_helper
-
-// Defines specialized serialization/deserialization routines that
-// default to allowing a 2GB max message size.
-//
-// To instantiate this template for a particular type `T`, use
-// `TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(T)`, as defined below.
-template <typename T>
-class UnlimitedSizeProtoSerializationTraits {
- public:
-  static Status Serialize(const T& msg, grpc_byte_buffer** bp,
-                          bool* own_buffer) {
-    *own_buffer = true;
-    int byte_size = msg.ByteSize();
-    if (byte_size < 0) {
-      return Status(StatusCode::INTERNAL, "Message length was negative");
-    } else if (byte_size <=
-               tensorflow_helper::kGrpcBufferWriterMaxBufferLength) {
-      grpc_slice slice = grpc_slice_malloc(byte_size);
-      GPR_CODEGEN_ASSERT(
-          GRPC_SLICE_END_PTR(slice) ==
-          msg.SerializeWithCachedSizesToArray(GRPC_SLICE_START_PTR(slice)));
-      *bp = grpc_raw_byte_buffer_create(&slice, 1);
-      grpc_slice_unref(slice);
-      return Status::OK;
-    } else {
-      tensorflow_helper::GrpcBufferWriter writer(
-          bp, tensorflow_helper::kGrpcBufferWriterMaxBufferLength);
-      return msg.SerializeToZeroCopyStream(&writer)
-                 ? Status::OK
-                 : Status(StatusCode::INTERNAL, "Failed to serialize message");
-    }
-  }
-
-  static Status Deserialize(grpc_byte_buffer* buffer, T* msg,
-                            int max_message_size = INT_MAX) {
-    if (buffer == nullptr) {
-      return Status(StatusCode::INTERNAL, "No payload");
-    }
-    Status result = Status::OK;
-    {
-      tensorflow_helper::GrpcBufferReader reader(buffer);
-      ::grpc::protobuf::io::CodedInputStream decoder(&reader);
-      if (max_message_size == 0) {
-        // NOTE(mrry): Override maximum message size to 2GB.
-        decoder.SetTotalBytesLimit(INT_MAX, INT_MAX);
-      } else {
-        decoder.SetTotalBytesLimit(max_message_size, max_message_size);
-      }
-      if (!msg->ParseFromCodedStream(&decoder)) {
-        result = Status(StatusCode::INTERNAL, msg->InitializationErrorString());
-      }
-      if (!decoder.ConsumedEntireMessage()) {
-        result = Status(StatusCode::INTERNAL, "Did not read entire message");
-      }
-    }
-    grpc_byte_buffer_destroy(buffer);
-    return result;
-  }
-};
-
-}  // namespace grpc
-
-// For the given protobuf message type `MessageType`, specializes the
-// gRPC serialization and deserialization such that the default
-// maximum message size is 2GB.
-#define TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(MessageType)             \
-  namespace grpc {                                                    \
-  template <>                                                         \
-  class SerializationTraits<MessageType>                              \
-      : public UnlimitedSizeProtoSerializationTraits<MessageType> {}; \
-  }  // namespace grpc
-
-#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 3e79a406831fbaee2fa51348463cd425bfd9614e..fd1c150fa7aab95bee0c492ce553b9c7f58cd487 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -91,6 +91,15 @@ void ReEncodeConsts(GraphDef* gdef) {
 }
 }  // namespace
 
+Status GrpcSession::Handle(string* out_handle) {
+  mutex_lock l(mu_);
+  if (handle_.empty()) {
+    return errors::InvalidArgument("A session is not created yet....");
+  }
+  *out_handle = handle_;
+  return Status::OK();
+}
+
 Status GrpcSession::CreateImpl(CallOptions* call_options,
                                const GraphDef& graph) {
   {
@@ -274,14 +283,9 @@ Status GrpcSession::Run(const std::vector<std::pair<string, Tensor>>& inputs,
 Status GrpcSession::RunProto(CallOptions* call_options,
                              MutableRunStepRequestWrapper* req,
                              MutableRunStepResponseWrapper* resp) {
-  {
-    mutex_lock l(mu_);
-    if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
-    }
-
-    req->set_session_handle(handle_);
-  }
+  string handle;
+  TF_RETURN_IF_ERROR(Handle(&handle));
+  req->set_session_handle(handle);
   return master_->RunStep(call_options, req, resp);
 }
 
@@ -293,14 +297,7 @@ Status GrpcSession::PRunSetup(const std::vector<string>& input_names,
   PartialRunSetupRequest req;
   PartialRunSetupResponse resp;
   CallOptions call_options;
-  {
-    mutex_lock l(mu_);
-    if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
-    }
-
-    req.set_session_handle(handle_);
-  }
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   for (const string& feed : input_names) {
     req.add_feed(feed);
   }
@@ -400,6 +397,55 @@ Status GrpcSession::Reset(const SessionOptions& options,
   return ret;
 }
 
+Status GrpcSession::MakeCallable(const CallableOptions& callable_options,
+                                 CallableHandle* out_handle) {
+  MakeCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  *req.mutable_options() = callable_options;
+  MakeCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  TF_RETURN_IF_ERROR(master_->MakeCallable(&call_options, &req, &resp));
+  *out_handle = resp.handle();
+  return Status::OK();
+}
+
+Status GrpcSession::RunCallable(CallableHandle handle,
+                                const std::vector<Tensor>& feed_tensors,
+                                std::vector<Tensor>* fetch_tensors,
+                                RunMetadata* run_metadata) {
+  RunCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  req.set_handle(handle);
+  for (const Tensor& feed : feed_tensors) {
+    feed.AsProtoTensorContent(req.mutable_feed()->Add());
+  }
+
+  RunCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  TF_RETURN_IF_ERROR(master_->RunCallable(&call_options, &req, &resp));
+  for (const TensorProto& fetch : resp.fetch()) {
+    Tensor fetch_tensor;
+    if (!fetch_tensor.FromProto(cpu_allocator(), fetch)) {
+      return errors::Internal(
+          "Could not parse fetched tensor data in response from master.");
+    }
+    fetch_tensors->push_back(std::move(fetch_tensor));
+  }
+  return Status::OK();
+}
+
+Status GrpcSession::ReleaseCallable(CallableHandle handle) {
+  ReleaseCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  req.set_handle(handle);
+  ReleaseCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  return master_->ReleaseCallable(&call_options, &req, &resp);
+}
+
 class GrpcSessionFactory : public SessionFactory {
  public:
   bool AcceptsOptions(const SessionOptions& options) override {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index d87956a13515fde533e746d2abd04e4a2f4959ae..63795117f9763434f5ff331d3d2d3bdb99413e81 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -82,20 +82,27 @@ class GrpcSession : public Session {
   Status Close() override;
 
   // NOTE: This API is still experimental and may change.
-  ::tensorflow::Status PRunSetup(const std::vector<string>& input_names,
-                                 const std::vector<string>& output_names,
-                                 const std::vector<string>& target_nodes,
-                                 string* handle) override;
+  Status PRunSetup(const std::vector<string>& input_names,
+                   const std::vector<string>& output_names,
+                   const std::vector<string>& target_nodes,
+                   string* handle) override;
 
   // NOTE: This API is still experimental and may change.
-  ::tensorflow::Status PRun(
-      const string& handle,
-      const std::vector<std::pair<string, Tensor> >& inputs,
-      const std::vector<string>& output_names,
-      std::vector<Tensor>* outputs) override;
+  Status PRun(const string& handle,
+              const std::vector<std::pair<string, Tensor> >& inputs,
+              const std::vector<string>& output_names,
+              std::vector<Tensor>* outputs) override;
 
   Status ListDevices(std::vector<DeviceAttributes>* response) override;
 
+  Status MakeCallable(const CallableOptions& callable_options,
+                      CallableHandle* out_handle) override;
+  Status RunCallable(CallableHandle handle,
+                     const std::vector<Tensor>& feed_tensors,
+                     std::vector<Tensor>* fetch_tensors,
+                     RunMetadata* run_metadata) override;
+  Status ReleaseCallable(CallableHandle handle) override;
+
  protected:
   // Takes ownership of `*master`.
   void SetRemoteMaster(std::unique_ptr<MasterInterface> master);
@@ -111,6 +118,8 @@ class GrpcSession : public Session {
   // The current version of the graph.
   int64 current_graph_version_ GUARDED_BY(mu_);
 
+  Status Handle(string* out_handle) LOCKS_EXCLUDED(mu_);
+
   Status RunHelper(const RunOptions& run_options,
                    const std::vector<std::pair<string, Tensor> >& inputs,
                    const std::vector<string>& output_tensor_names,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index 335c3febe20e17e5b5ea57dc68c69e616997e14b..45b15a54a29b481b4888515f18bd913d71c1013c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -120,6 +120,49 @@ TEST(GrpcSessionTest, BasicNonProtoAPI) {
   }
 }
 
+TEST(GrpcSessionTest, BasicCallable) {
+  GraphDef graph;
+  string node_names[3];
+  // c = a * b
+  CreateGraphDef(&graph, node_names);
+
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  for (int iters = 0; iters < 25; ++iters) {
+    TF_CHECK_OK(session->Create(graph));
+    {
+      // Just run to target node
+      CallableOptions opts;
+      opts.add_target(node_names[2]);
+      Session::CallableHandle handle;
+      TF_CHECK_OK(session->MakeCallable(opts, &handle));
+      TF_CHECK_OK(session->RunCallable(handle, {}, nullptr, nullptr));
+      TF_CHECK_OK(session->ReleaseCallable(handle));
+    }
+    {
+      // Run to a target node and a real tensor
+      CallableOptions opts;
+      opts.add_target(node_names[1]);
+      opts.add_fetch(node_names[2] + ":0");
+      Session::CallableHandle handle;
+      TF_CHECK_OK(session->MakeCallable(opts, &handle));
+      std::vector<Tensor> outputs;
+      TF_CHECK_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+      ASSERT_EQ(1, outputs.size());
+      ASSERT_TRUE(outputs[0].IsInitialized());
+      ASSERT_EQ(4.0, outputs[0].flat<float>()(0));
+      TF_CHECK_OK(session->ReleaseCallable(handle));
+    }
+
+    TF_CHECK_OK(session->Close());
+  }
+}
+
 TEST(GrpcSessionTest, BasicNonProtoAPIConsistentOrder) {
   GraphDef graph;
   string node_names[3];
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 2a2f7e3ffbef10f9f2997fc554f010d3f8689ca2..62b299d5c2c15ee4b4bbf3071ed2f8e194d353e7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -26,24 +26,16 @@ limitations under the License.
 #include "grpc++/impl/codegen/sync_stream.h"
 #include "grpc++/support/byte_buffer.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RegisterGraphRequest);
-// Contains potentially large TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunGraphRequest);
-// Contains potentially large StepStats, TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunGraphResponse);
-
 namespace tensorflow {
 class GrpcByteSource : public TensorResponse::Source {
  public:
-  explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
+  explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
   ~GrpcByteSource() override { DeleteStream(); }
 
-  typedef ::grpc::tensorflow_helper::GrpcBufferReader Reader;
+  typedef ::grpc::GrpcProtoBufferReader Reader;
 
   protobuf::io::ZeroCopyInputStream* contents() override {
     DeleteStream();
@@ -58,7 +50,7 @@ class GrpcByteSource : public TensorResponse::Source {
     }
   }
 
-  grpc_byte_buffer* buffer_;  // Not owned
+  ::grpc::ByteBuffer* buffer_;  // Not owned
   Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
   char space_[sizeof(Reader)];
 };
@@ -74,17 +66,15 @@ class ServerContext;
 // Support parsing/unparsing of tensorflow::TensorResponse.
 // Wire-format is identical to RecvTensorResponse.
 template <>
-class SerializationTraits<tensorflow::TensorResponse>
-    : public UnlimitedSizeProtoSerializationTraits<tensorflow::TensorResponse> {
+class SerializationTraits<tensorflow::TensorResponse> {
  public:
-  static Status Serialize(const tensorflow::TensorResponse& msg,
-                          grpc_byte_buffer** bp, bool* own_buffer) {
+  static Status Serialize(const tensorflow::TensorResponse& msg, ByteBuffer* bp,
+                          bool* own_buffer) {
     LOG(FATAL) << "TODO(sanjay,jeff): Implement";
     return Status();
   }
-  static Status Deserialize(grpc_byte_buffer* buffer,
-                            tensorflow::TensorResponse* msg,
-                            int max_message_size = INT_MAX) {
+  static Status Deserialize(ByteBuffer* buffer,
+                            tensorflow::TensorResponse* msg) {
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
     }
@@ -98,7 +88,7 @@ class SerializationTraits<tensorflow::TensorResponse>
                             "TensorResponse parse error", s.ToString()));
       }
     }
-    grpc_byte_buffer_destroy(buffer);
+    buffer->Clear();
     return result;
   }
 };
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 6182f95f285dd0cf38cca77165a4c2fd001a4b44..1a7e5219cd243a1e0eb64c13317f109fe5a93336 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/log_memory.h"
@@ -88,20 +88,15 @@ void EnableCPUAllocatorFullStats(bool enable) {
   cpu_allocator_collect_full_stats = enable;
 }
 
-class CPUAllocator : public VisitableAllocator {
+class CPUAllocator : public Allocator {
  public:
-  CPUAllocator()
-      : total_allocation_warning_triggered_(false), allocation_begun_(false) {}
+  CPUAllocator() : total_allocation_warning_triggered_(false) {}
 
   ~CPUAllocator() override {}
 
   string Name() override { return "cpu"; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    if (!allocation_begun_) {
-      allocation_begun_ = true;
-    }
-
     if (num_bytes > LargeAllocationWarningBytes()) {
       LOG(WARNING) << "Allocation of " << num_bytes << " exceeds "
                    << 100 * kLargeAllocationWarningThreshold
@@ -127,38 +122,16 @@ class CPUAllocator : public VisitableAllocator {
         total_allocation_warning_triggered_ = true;
       }
     }
-
-    // visit each Visitor in alloc_visitors_
-    if (p != nullptr) {
-      for (const Visitor& v : alloc_visitors_) {
-        v(p, num_bytes);
-      }
-    }
-
     return p;
   }
 
   void DeallocateRaw(void* ptr) override {
-    std::size_t alloc_size;
-    bool init_alloc_size = false;
     if (cpu_allocator_collect_stats) {
-      alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
-      init_alloc_size = true;
+      const std::size_t alloc_size =
+          port::MallocExtension_GetAllocatedSize(ptr);
       mutex_lock l(mu_);
       stats_.bytes_in_use -= alloc_size;
     }
-
-    // visit each Visitor in free_visitors_
-    if (ptr != nullptr) {
-      if (!init_alloc_size) {
-        alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
-        init_alloc_size = true;
-      }
-      for (const Visitor& v : free_visitors_) {
-        v(ptr, alloc_size);
-      }
-    }
-
     port::AlignedFree(ptr);
   }
 
@@ -178,37 +151,11 @@ class CPUAllocator : public VisitableAllocator {
     return port::MallocExtension_GetAllocatedSize(ptr);
   }
 
-  // REQUIRES: can only add visitors before the first Allocate call
-
-  void AddAllocVisitor(Visitor visitor) override {
-    mutex_lock lock(visitor_mutex_);
-    CHECK(!allocation_begun_)
-        << "AddAllocVisitor may not be called after allocation has begun.";
-    alloc_visitors_.push_back(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    mutex_lock lock(visitor_mutex_);
-    CHECK(!allocation_begun_)
-        << "AddFreeVisitor may not be called after allocation has begun.";
-    free_visitors_.push_back(visitor);
-  }
-
  private:
   mutex mu_;
   AllocatorStats stats_ GUARDED_BY(mu_);
   bool total_allocation_warning_triggered_ GUARDED_BY(mu_);
 
-  // visitor_mutex_ protects write access to alloc_visitors_ and free_visitors_.
-  // While write access is mutually exclusive, reads may happen concurrently.
-  // This is okay because we may only append to alloc_visitors_ and
-  // free_visitors_ before first allocation, and subsequently we only read these
-  // vectors.
-  mutex visitor_mutex_;
-  std::vector<Visitor> alloc_visitors_;
-  std::vector<Visitor> free_visitors_;
-  std::atomic<bool> allocation_begun_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
 };
 
diff --git a/tensorflow/core/framework/attr_value_util_test.cc b/tensorflow/core/framework/attr_value_util_test.cc
index e4fad917ffe1d4a0790bf1fd56e3c72f841523d8..1a3994736cb5627c590c3029c7b9e163dff2351c 100644
--- a/tensorflow/core/framework/attr_value_util_test.cc
+++ b/tensorflow/core/framework/attr_value_util_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 
+#include <numeric>
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 362d345133aa292ac1755c0bc8c0ab04d10efab6..a82fb50d880b0aa2f4768832a0ff5444ed61f45e 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -103,11 +103,8 @@ struct CollectiveParams {
   // Rank of this device in each subdivision permutation.
   std::vector<int> subdiv_rank;
   std::vector<int> subdiv_source_rank;
-  const Tensor* in_tensor;             // kernel input
-  Tensor* out_tensor;                  // kernel output
   std::unique_ptr<OpKernel> merge_op;  // reduction only
   std::unique_ptr<OpKernel> final_op;  // reduction only
-  OpKernelContext* op_context;
   string ToString() const;
 };
 
@@ -181,7 +178,7 @@ class StepSequenceInterface {
   virtual void RefreshStepIdSequenceAsync(int64 graph_key,
                                           const StatusCallback& done) = 0;
 
-  // Returns the the step_id that should be used for initiating a new execution
+  // Returns the step_id that should be used for initiating a new execution
   // on the specified graph. May return the same step_id multiple times if
   // RetireStepId or RefreshStepIdReservation is not called.
   virtual int64 NextStepId(int64 graph_key) = 0;
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index dab53cba3e6343c82052c2997c2947b1d11bed59..b1d01278098b5126aa974c5c2b55868fe8810e95 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -111,7 +111,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 abs(
 }  // namespace numext
 }  // namespace Eigen
 
-#if defined(COMPILER_MSVC) && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
 namespace std {
 template <>
 struct hash<Eigen::half> {
@@ -120,6 +120,6 @@ struct hash<Eigen::half> {
   }
 };
 }  // namespace std
-#endif  // COMPILER_MSVC
+#endif  // _MSC_VER
 
 #endif  // TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index cfde1e8ea33b46f84a1fb185d5e2dc45e116deec..05171006b0c7b2f389fd48f57af191bbf13d88b8 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -96,7 +96,7 @@ OpKernel::OpKernel(OpKernelConstruction* context,
       output_memory_types_(context->output_memory_types().begin(),
                            context->output_memory_types().end()),
       graph_def_version_(context->graph_def_version()),
-      is_internal_(StringPiece(type_string()).starts_with("_")),
+      is_internal_(str_util::StartsWith(type_string(), "_")),
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 54ecaa5dd431d435fe04948223d565802d525be0..229b4a45fa9c6db51da0c791a014dd93f0bef8ae 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -40,6 +40,7 @@ InferenceContext::InferenceContext(
     : graph_def_version_(graph_def_version),
       node_def_(CHECK_NOTNULL(node_def)) {
   std::vector<ShapeHandle> input_tensors_as_shape_handles;
+  input_tensors_as_shape_handles.reserve(input_tensors_as_shapes.size());
   for (const TensorShapeProto& p : input_tensors_as_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromShapeProto(p, &shape));
@@ -50,6 +51,7 @@ InferenceContext::InferenceContext(
   }
   PreInputInit(op_def, input_tensors, input_tensors_as_shape_handles);
   if (!construction_status_.ok()) return;
+  inputs_.reserve(input_shapes.size());
   for (const TensorShapeProto& p : input_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromShapeProto(p, &shape));
@@ -93,6 +95,7 @@ InferenceContext::InferenceContext(
     : graph_def_version_(graph_def_version),
       node_def_(CHECK_NOTNULL(node_def)) {
   std::vector<ShapeHandle> input_tensors_as_shape_handles;
+  input_tensors_as_shape_handles.reserve(input_tensors_as_shapes.size());
   for (const PartialTensorShape& p : input_tensors_as_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromPartialTensorShape(p, &shape));
@@ -103,6 +106,7 @@ InferenceContext::InferenceContext(
   }
   PreInputInit(op_def, input_tensors, input_tensors_as_shape_handles);
   if (!construction_status_.ok()) return;
+  inputs_.reserve(input_shapes.size());
   for (const PartialTensorShape& p : input_shapes) {
     ShapeHandle shape;
     construction_status_.Update(MakeShapeFromPartialTensorShape(p, &shape));
@@ -229,9 +233,7 @@ void InferenceContext::PreInputInit(
   for (const auto& e : output_name_map_) {
     num_outputs = std::max(num_outputs, e.second.second);
   }
-  for (int i = 0; i < num_outputs; ++i) {
-    outputs_.push_back(nullptr);
-  }
+  outputs_.assign(num_outputs, nullptr);
   output_handle_shapes_and_types_.resize(num_outputs);
 }
 
@@ -469,13 +471,15 @@ Status InferenceContext::MergePrefix(ShapeHandle s, ShapeHandle prefix,
   TF_RETURN_IF_ERROR(WithRankAtLeast(s, rank, &s));
 
   // Merge the prefix dims and create the new output shapes.
+  const int32 rank_s = Rank(s);
   std::vector<DimensionHandle> dims;
+  dims.reserve(std::max(rank, rank_s));
   dims.resize(rank);
   for (int i = 0; i < rank; ++i) {
     TF_RETURN_IF_ERROR(Merge(Dim(s, i), Dim(prefix, i), &dims[i]));
   }
   *prefix_out = MakeShape(dims);
-  for (int i = rank; i < Rank(s); ++i) dims.push_back(Dim(s, i));
+  for (int i = rank; i < rank_s; ++i) dims.push_back(Dim(s, i));
   *s_out = MakeShape(dims);
   return Status::OK();
 }
@@ -726,6 +730,24 @@ ShapeHandle InferenceContext::Matrix(DimensionOrConstant dim1,
   return MakeShape({dim1, dim2});
 }
 
+Status InferenceContext::MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+    int input_idx, ShapeHandle* out) {
+  ShapeHandle input_shape;
+  TF_RETURN_IF_ERROR(WithRankAtMost(input(input_idx), 1, &input_shape));
+
+  requested_input_tensor_as_partial_shape_[input_idx] = true;
+  if (input_idx < input_tensors_as_shapes_.size() &&
+      input_tensors_as_shapes_[input_idx].IsSet() &&
+      RankKnown(input_tensors_as_shapes_[input_idx])) {
+    *out = input_tensors_as_shapes_[input_idx];
+    return Status::OK();
+  }
+
+  return InternalMakeShapeFromTensor(
+      true /* treat_unknown_scalar_tensor_as_unknown_shape */,
+      input_tensor(input_idx), input_shape, out);
+}
+
 Status InferenceContext::MakeShapeFromShapeTensor(int input_idx,
                                                   ShapeHandle* out) {
   ShapeHandle input_shape;
@@ -739,13 +761,31 @@ Status InferenceContext::MakeShapeFromShapeTensor(int input_idx,
     return Status::OK();
   }
 
-  return MakeShapeFromTensor(input_tensor(input_idx), input_shape, out);
+  return InternalMakeShapeFromTensor(
+      false /* treat_unknown_scalar_tensor_as_unknown_shape */,
+      input_tensor(input_idx), input_shape, out);
 }
 
 Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
                                              ShapeHandle tensor_shape,
                                              ShapeHandle* out) {
+  return InternalMakeShapeFromTensor(
+      false /* treat_unknown_scalar_tensor_as_unknown_shape */, t, tensor_shape,
+      out);
+}
+
+Status InferenceContext::InternalMakeShapeFromTensor(
+    bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t,
+    ShapeHandle tensor_shape, ShapeHandle* out) {
+  // Only callers who have set
+  if (!treat_unknown_scalar_tensor_as_unknown_shape) {
+    TF_RETURN_IF_ERROR(WithRank(tensor_shape, 1, &tensor_shape));
+  }
   if (t == nullptr) {
+    // This is guarded by the check above.
+    if (Rank(tensor_shape) == 0) {
+      return ReturnUnknownShape(out);
+    }
     // Shape tensor is not known, but if the shape of the shape tensor is then
     // the right number of unknown dims can be created.
     DimensionHandle shape_dim = Dim(tensor_shape, 0);
@@ -759,10 +799,46 @@ Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
     return ReturnCreatedShape(dims, out);
   }
 
+  if (t->shape().dims() == 0) {
+    if (t->dtype() == DataType::DT_INT32) {
+      auto flat_t = t->scalar<int32>();
+      if (flat_t() != -1) {
+        *out = nullptr;
+        return errors::InvalidArgument(
+            "Input tensor must be rank 1, or if its rank 0 it must have value "
+            "-1 "
+            "(representing an unknown shape).  Saw value: ",
+            flat_t());
+      }
+      return ReturnUnknownShape(out);
+    } else if (t->dtype() == DataType::DT_INT64) {
+      auto flat_t = t->scalar<int64>();
+      if (flat_t() != -1) {
+        *out = nullptr;
+        return errors::InvalidArgument(
+            "Input tensor must be rank 1, or if its rank 0 it must have value "
+            "-1 "
+            "(representing an unknown shape).  Saw value: ",
+            flat_t());
+      }
+      return ReturnUnknownShape(out);
+    } else {
+      *out = nullptr;
+      return errors::InvalidArgument(
+          "Input tensor must be int32 or int64, but was ",
+          DataTypeString(t->dtype()));
+    }
+  }
+
   if (t->shape().dims() != 1) {
     *out = nullptr;
-    return errors::InvalidArgument("Input tensor must be rank 1, but was rank ",
-                                   t->shape().dims());
+    return errors::InvalidArgument(
+        "Input tensor must be rank 1, but was rank ", t->shape().dims(), ".",
+        ((t->shape().dims() == 0)
+             ? "If it is rank 0 rank 0 it must have statically known value -1 "
+               "(representing an unknown shape). "
+             : " "),
+        "Saw tensor shape ", t->shape().DebugString());
   }
   std::vector<DimensionHandle> dims;
   if (t->dtype() == DataType::DT_INT32) {
@@ -1033,6 +1109,7 @@ Status InferenceContext::Max(DimensionHandle first, DimensionOrConstant second,
 
 Status InferenceContext::AttachContext(const Status& status) {
   std::vector<string> input_shapes;
+  input_shapes.reserve(inputs_.size());
   for (const ShapeHandle& input_shape : inputs_) {
     input_shapes.emplace_back(DebugString(input_shape));
   }
@@ -1040,6 +1117,7 @@ Status InferenceContext::AttachContext(const Status& status) {
   // Add information about the input tensors and partial tensor shapes used.
   std::vector<string> input_from_tensors_str;
   std::vector<string> input_from_tensors_as_shape_str;
+  input_from_tensors_as_shape_str.reserve(inputs_.size());
   for (int i = 0; i < inputs_.size(); ++i) {
     if (requested_input_tensor_as_partial_shape_[i] &&
         i < input_tensors_as_shapes_.size() &&
@@ -1161,9 +1239,7 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
   if (!refined) {
     return false;
   }
-  for (int i = 0; i < new_values.size(); ++i) {
-    (*to_update)[i] = new_values[i];
-  }
+  to_update->swap(new_values);
   return true;
 }
 
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index accc587000767554f87a195e0ea33640cd696244..cdb4bd79bbb9040ad0a40eaa2f30134f5de79786 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -463,6 +463,12 @@ class InferenceContext {
   // the input tensor is NULL, then an unknown shape is returned.
   Status MakeShapeFromShapeTensor(int input_idx, ShapeHandle* out);
 
+  // Like the function above, but treats scalar values as unknown
+  // shapes.  **NOTE** If the scalar is statically known, its value
+  // must be -1 or an error is returned.
+  Status MakeShapeFromShapeTensorTreatScalarAsUnknownShape(int input_idx,
+                                                           ShapeHandle* out);
+
   // Returns in <out> a new shape corresponding to <proto>.
   Status MakeShapeFromShapeProto(const TensorShapeProto& proto,
                                  ShapeHandle* out);
@@ -708,6 +714,11 @@ class InferenceContext {
     merged_dims_.clear();
   }
 
+  // Helper method for MakeShapeFromTensor and MakeShapeFromShapeTensor.
+  Status InternalMakeShapeFromTensor(
+      bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t,
+      ShapeHandle tensor_shape, ShapeHandle* out);
+
   ShapeManager shape_manager_;
 
   // inputs_, outputs_, and input_tensors_as_shapes_ refer to values from
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index da103bfec97b3b487f94c1dfd5de21bcca4717ca..586c38e43bbe75fa0710b11bb7290ee7b3f627d9 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -1081,17 +1081,26 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) {
   t = ::tensorflow::test::AsTensor<int64>({});
   EXPECT_EQ("[]", create(&t));
 
+  // Test negative scalar
+  t = ::tensorflow::test::AsScalar<int32>(-1);
+  EXPECT_EQ("?", create(&t));
+
   t = ::tensorflow::test::AsTensor<float>({1, 2, 3});
   EXPECT_TRUE(str_util::StrContains(
       create(&t), "Input tensor must be int32 or int64, but was float"));
 
   t = ::tensorflow::test::AsScalar<int32>(1);
+  auto s_scalar = create(&t);
   EXPECT_TRUE(str_util::StrContains(
-      create(&t), "Input tensor must be rank 1, but was rank 0"));
+      s_scalar,
+      "Input tensor must be rank 1, or if its rank 0 it must have value -1"))
+      << s_scalar;
 
   t = ::tensorflow::test::AsTensor<int32>({1, 2}, TensorShape{2, 1});
+  auto s_matrix = create(&t);
   EXPECT_TRUE(str_util::StrContains(
-      create(&t), "Input tensor must be rank 1, but was rank 2"));
+      s_matrix, "Input tensor must be rank 1, but was rank 2"))
+      << s_matrix;
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int64>({3, -2, 1});
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index e2111d60389d51702463f377602067ddc1bade08..d5a45c73c37bf0807e9437a4c886ca0d96dc5c67 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -610,11 +610,15 @@ bool Tensor::IsInitialized() const {
 }
 
 void Tensor::CheckType(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype);
+  CHECK_EQ(dtype(), expected_dtype)
+      << DataTypeString(expected_dtype) << " expected, got "
+      << DataTypeString(dtype());
 }
 
 void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype);
+  CHECK_EQ(dtype(), expected_dtype)
+      << DataTypeString(expected_dtype) << " expected, got "
+      << DataTypeString(dtype());
   CHECK(IsAligned()) << "CheckTypeAndIsAligned";
 }
 
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index f15e2ce9fa7b02fd07209d0784cd436b8e68f10b..c678283fce1768888cb7730aac8f306336c78c4b 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -666,20 +666,17 @@ Status GraphConstructor::ModifyNodeDefForImport(NodeDef* node_def) {
 void RemoveInputs(const std::vector<int>& inputs_to_remove, NodeDef* node_def,
                   std::vector<bool>* input_already_exists) {
   // Remove 'inputs_to_remove' from 'node_def'
-  // TODO(skyewm): is there a better way to do this?
-  std::vector<string> inputs;
-  inputs.reserve(node_def->input_size());
-  for (int i = 0; i < node_def->input_size(); ++i) {
-    inputs.push_back(node_def->input(i));
-  }
-  node_def->clear_input();
-  for (int i = 0, j = 0; i < inputs.size(); ++i) {
+  NodeDef copy;
+  copy.mutable_input()->Reserve(node_def->input_size() -
+                                inputs_to_remove.size());
+  for (int i = 0, j = 0; i < node_def->input_size(); ++i) {
     if (j < inputs_to_remove.size() && i == inputs_to_remove[j]) {
       ++j;
     } else {
-      node_def->add_input(inputs[i]);
+      copy.add_input()->swap(*node_def->mutable_input(i));
     }
   }
+  node_def->mutable_input()->Swap(copy.mutable_input());
   // Remove 'inputs_to_remove' from 'input_already_exists'
   for (int idx : inputs_to_remove) {
     input_already_exists->erase(input_already_exists->begin() + idx);
@@ -745,9 +742,21 @@ void GraphConstructor::AddControlDependencies(
   // dependencies
   for (const string& control_dep : opts_.control_dependencies) {
     string input = TensorId(control_dep, Graph::kControlSlot).ToString();
-    const protobuf::RepeatedPtrField<string>& inputs = node_def->input();
-    if (std::find(inputs.begin(), inputs.end(), input) != inputs.end()) {
-      // Control dependency already exists
+    bool found = false;
+    for (int i = node_def->input_size() - 1; i >= 0; --i) {
+      const string& node_input = node_def->input(i);
+      if (node_input[0] != '^') {
+        // Control inputs are at the end. Break when we reach the non-control
+        // inputs.
+        break;
+      }
+      if (node_input == input) {
+        // Control dependency already exists
+        found = true;
+        break;
+      }
+    }
+    if (found) {
       continue;
     }
     node_def->add_input(input);
@@ -761,10 +770,10 @@ void GraphConstructor::AddPrefixToNodeDef(
   node_def->set_name(strings::StrCat(prefix_, node_def->name()));
   // Update names of input nodes
   for (int i = 0; i < node_def->input_size(); ++i) {
-    StringPiece input(node_def->input(i));
     // Skip remapped inputs (which already exist in g_ and are not being
     // imported).
     if (input_already_exists[i]) continue;
+    StringPiece input(node_def->input(i));
     if (str_util::ConsumePrefix(&input, "^")) {
       node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
@@ -933,10 +942,10 @@ Status GraphConstructor::Convert() {
         }
       }
 
-      // TODO(ashankar): The line below means an additional copy of the NodeDef,
-      // which can be expensive if the NodeDef contains large tensors in it.
-      // Might make sense to change the API for ImportGraphDef to take a mutable
-      // GraphDef* and avoid the copying.
+      // TODO(ashankar): The line below means an additional copy of the
+      // NodeDef, which can be expensive if the NodeDef contains large tensors
+      // in it. Might make sense to change the API for ImportGraphDef to take
+      // a mutable GraphDef* and avoid the copying.
       imported_node_def = original_node_def;
       if (!opts_.input_map.empty()) {
         // Note that input_already_exists can shrink here
@@ -980,7 +989,7 @@ Status GraphConstructor::Convert() {
             src_node->num_outputs(), " outputs");
       }
 
-      inputs.push_back(InputInfo(id.first.ToString(), src_node, src_index));
+      inputs.emplace_back(id.first.ToString(), src_node, src_index);
     }
 
     if (has_data_back_edge && !IsMerge(*node_def)) {
@@ -1010,8 +1019,7 @@ Status GraphConstructor::Convert() {
       if (inputs[i].node == nullptr) {
         // Record this back edge, which will be added after all nodes
         // are created.
-        back_edges_.push_back(
-            EdgeInfo(inputs[i].name, inputs[i].index, node, i));
+        back_edges_.emplace_back(inputs[i].name, inputs[i].index, node, i);
       } else if (inputs[i].index == Graph::kControlSlot) {
         g_->AddControlEdge(inputs[i].node, node);
       } else {
@@ -1019,12 +1027,7 @@ Status GraphConstructor::Convert() {
       }
     }
 
-    // Function shape inference is supported on an opt-in basis per
-    // ShapeRefiner.
-    if (refiner_->function_shape_inference_supported() ||
-        g_->flib_def().Find(node_def->name()) == nullptr) {
-      TF_RETURN_IF_ERROR(ValidateShape(node));
-    }
+    TF_RETURN_IF_ERROR(ValidateShape(node));
 
     // Update pending_count_ for outputs.
     UpdatePendingCountAndReady(outputs_, o, &pending_count_, &ready_);
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index c6352c1448bb38ece78530007e2534d475ef7fb6..352f08fedecd426c06c8668ff8f3910286e6900a 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -196,10 +196,19 @@ TEST_F(SingleMachineTest, GraphOptimizations) {
   TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
   std::set<string> cost_nodes;
   for (const auto& node : metadata.cost_graph().node()) {
+#ifdef INTEL_MKL
+    // Skip the special nodes inserted by TF (and MKL): these are either
+    // prefixed with an underscore or contain "/_".
+    if (node.name()[0] == '_' || node.name().find("/_") != string::npos) {
+      continue;
+    }
+    cost_nodes.insert(node.name());
+#else
     // Skip nodes added by TF internally.
     if (node.name()[0] != '_') {
       cost_nodes.insert(node.name());
     }
+#endif
   }
   const std::set<string> expected_cost_nodes = {
       "zero",      "one",      "add",         "square",
diff --git a/tensorflow/core/grappler/costs/graph_memory.cc b/tensorflow/core/grappler/costs/graph_memory.cc
index 3604de392f803b8b2eb65e796848c2c3ec6a90e5..a5736d40b13fc6d38a6ffd64f5daa0f46bd3ba75 100644
--- a/tensorflow/core/grappler/costs/graph_memory.cc
+++ b/tensorflow/core/grappler/costs/graph_memory.cc
@@ -14,7 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/graph_memory.h"
-#include <list>
+
+#include <deque>
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -120,7 +121,7 @@ int64 GraphMemory::InferMemUsageForNeighbors(
 static GraphMemory::LiveTensor* FindOrCreateLiveTensor(
     const string& node_name, int output_id,
     std::unordered_map<string, GraphMemory::LiveTensor*>* live_tensors,
-    std::list<GraphMemory::LiveTensor>* device_tensors) {
+    std::deque<GraphMemory::LiveTensor>* device_tensors) {
   string name = strings::StrCat(node_name, ":", output_id);
   GraphMemory::LiveTensor* live;
   auto it = live_tensors->find(name);
@@ -141,6 +142,10 @@ static GraphMemory::LiveTensor* FindOrCreateLiveTensor(
 
 namespace {
 struct Event {
+  Event(int64 _timestamp, bool _allocated,
+        const GraphMemory::LiveTensor* _tensor)
+      : timestamp(_timestamp), allocated(_allocated), tensor(_tensor) {}
+
   int64 timestamp;
   bool allocated;
   const GraphMemory::LiveTensor* tensor;
@@ -160,13 +165,15 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
   }
 
   std::unordered_map<string, LiveTensor*> live_tensors;
-  std::unordered_map<string, std::list<LiveTensor>> live_tensors_per_device;
-
-  NodeMap node_map(&item_.graph);
+  std::unordered_map<string, std::deque<LiveTensor>> live_tensors_per_device;
+  std::unordered_map<string, const NodeDef*> node_map;
+  for (const NodeDef& node : item_.graph.node()) {
+    node_map[node.name()] = &node;
+  }
   for (const auto& dev_stats : timeline.dev_stats()) {
     const string& device_name = dev_stats.device();
     const bool is_gpu = (device_name.find("GPU:") || device_name.find("gpu:"));
-    std::list<LiveTensor>& device_tensors =
+    std::deque<LiveTensor>& device_tensors =
         live_tensors_per_device[dev_stats.device()];
     for (const auto& node_stats : dev_stats.node_stats()) {
       for (int i = 0; i < node_stats.output_size(); ++i) {
@@ -191,12 +198,13 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
                                     node_stats.op_end_rel_micros()));
       }
 
-      const NodeDef* node = node_map.GetNode(node_stats.node_name());
-      if (!node) {
+      auto it = node_map.find(node_stats.node_name());
+      if (it == node_map.end()) {
         // Skip nodes inserted by TF since they don't exist in the original
         // graph (e.g _Send/_Recv nodes).
         continue;
       }
+      const NodeDef* node = it->second;
       std::unordered_set<int> swapped_inputs;
       if (is_gpu) {
         auto it = node->attr().find("_swap_to_host");
@@ -237,14 +245,16 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
     std::vector<Event> events;
     events.reserve(2 * live_per_device.second.size());
     for (const auto& live : live_per_device.second) {
-      events.push_back(Event{live.allocation_time.count(), true, &live});
-      events.push_back(Event{live.deallocation_time.count(), false, &live});
+      events.emplace_back(static_cast<int64>(live.allocation_time.count()),
+                          true, &live);
+      events.emplace_back(static_cast<int64>(live.deallocation_time.count()),
+                          false, &live);
     }
     std::stable_sort(events.begin(), events.end());
     size_t peak = 0;
-    std::set<const LiveTensor*> live_at_peak;
+    std::unordered_set<const LiveTensor*> live_at_peak;
     size_t current = 0;
-    std::set<const LiveTensor*> currently_live;
+    std::unordered_set<const LiveTensor*> currently_live;
     for (int i = 0; i < events.size(); ++i) {
       const auto& event = events[i];
 
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 8fe154dbf3c7d634ec9266b86135f721b25edcc9..9fa2b7a259b1e96e2f9cea1326f3b41aed4533e0 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -920,9 +920,9 @@ Status GraphProperties::UpdateResource(
 }
 
 Status GraphProperties::InferStatically(bool assume_valid_feeds) {
-  Graph graph(OpRegistry::Global());
-  FunctionLibraryDefinition function_library(graph.op_registry(),
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item_.graph.library());
+  Graph graph(function_library);
   ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   shape_refiner.set_require_shape_inference_fns(false);
   shape_refiner.set_disable_constant_propagation(true);
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index db4dae96de44cba70221fde551e3f997e4db93cc..d3d89b59af7db2d834cdf7476b106b181352b657 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -742,8 +742,6 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
   EXPECT_EQ("float: [128,256]", PropToString(prop));
 }
 
-#if 0
-// Disabled for now since this doesnt' seem to work when functions are instantiated inside while loops. It's also unclear whether it's correct when the same function is instantiated twice.
 TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
   // Test graph produced in python using:
   /*
@@ -757,27 +755,26 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
       z = MyAdd(x, y)
       z = MyAdd(x, z)
   */
-  // Check that the shape of the second MyAdd node propagates
-  // correctly.
+  // Check that the shape inference code infers what it can.
   GrapplerItem item;
   string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
                                  "simple_function.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
   TF_CHECK_OK(properties.InferStatically(false));
-  const auto props = properties.GetOutputProperties("MyAdd_55e046a8_1");
-  const OpInfo::TensorProperties& prop = props[0];
-  EXPECT_EQ(DT_FLOAT, prop.dtype());
-  EXPECT_FALSE(prop.shape().unknown_rank());
-  EXPECT_EQ(2, prop.shape().dim_size());
-  EXPECT_EQ(1, prop.shape().dim(0).size());
-  EXPECT_EQ(2, prop.shape().dim(1).size());
-
-  PartialTensorShape shape(prop.shape());
-  EXPECT_TRUE(shape.IsFullyDefined());
-  EXPECT_FALSE(shape.unknown_rank());
+  const auto out_props = properties.GetOutputProperties("MyAdd_55e046a8");
+  const OpInfo::TensorProperties& out_prop = out_props[0];
+  EXPECT_EQ(DT_FLOAT, out_prop.dtype());
+  EXPECT_TRUE(out_prop.shape().unknown_rank());
+
+  const auto in_props = properties.GetInputProperties("MyAdd_55e046a8");
+  const OpInfo::TensorProperties& in_prop = in_props[0];
+  EXPECT_EQ(DT_FLOAT, in_prop.dtype());
+  EXPECT_FALSE(in_prop.shape().unknown_rank());
+  EXPECT_EQ(2, in_prop.shape().dim_size());
+  EXPECT_EQ(1, in_prop.shape().dim(0).size());
+  EXPECT_EQ(2, in_prop.shape().dim(1).size());
 }
-#endif
 
 TEST_F(GraphPropertiesTest, SymbolicShapes) {
   // Build a simple graph with placeholders of unknown dimensions. These
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 79735e6cc213e2925370182f446f5a3df5fabf2b..087190ad2a659a9e3314ef0b49737764888f43be 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -30,6 +30,7 @@ constexpr char kConst[] = "Const";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
 constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
+constexpr char kFusedConv2dBiasActivation[] = "FusedConv2DBiasActivation";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
 constexpr char kPlaceholder[] = "Placeholder";
@@ -196,6 +197,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter)},
       {kConv2dBackpropInput,
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
+      {kFusedConv2dBiasActivation,
+       wrap(&OpLevelCostEstimator::PredictFusedConv2DBiasActivation)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
@@ -545,7 +548,6 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   ops *= conv_dims.kx * conv_dims.ky;
   ops *= conv_dims.iz * conv_dims.oz;
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for Conv2D " << ops;
 
   if (conv_info != nullptr) {
     *conv_info = conv_dims;
@@ -983,6 +985,91 @@ Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation(
+    const OpContext& op_context) const {
+  // FusedConv2DBiasActivation computes a fused kernel which implements:
+  // 2D convolution, adds side input with separate scaling on convolution and
+  // side inputs, then adds bias, and finally applies the ReLU activation
+  // function to the result:
+  //
+  // Input -> Conv2D  ->  Add  -> BiasAdd  -> ReLU
+  //            ^          ^         ^
+  //          Filter   Side Input   Bias
+  //
+  // Note that when adding the side input, the operation multiplies the output
+  // of Conv2D by conv_input_scale, confusingly, and the side_input by
+  // side_input_scale.
+  //
+  // Note that in the special case that side_input_scale is 0, which we infer
+  // from side_input having dimensions [], we skip that addition operation.
+  //
+  // For more information, see
+  // contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+  auto& conv_input = op_context.op_info.inputs(0);
+  auto& filter = op_context.op_info.inputs(1);
+  auto& bias = op_context.op_info.inputs(2);
+  auto& side_input = op_context.op_info.inputs(3);
+  auto& conv_input_scale = op_context.op_info.inputs(4);
+  auto& side_input_scale = op_context.op_info.inputs(5);
+
+  // Manually compute our convolution dimensions.
+  bool found_unknown_shapes = false;
+  auto dims = ConvolutionDimensionsFromInputs(
+      conv_input.shape(), filter.shape(), op_context.op_info,
+      &found_unknown_shapes);
+
+  // Construct the shape of our output tensor from our convolution dimensions
+  // and format, as it may not be available yet.
+  //
+  // TODO(varomodt): should we centralize the Conv2D input/output shapes?
+  bool unknown_conv_format = false;
+  OpInfo::TensorProperties output;
+  switch (GetConvolutionFormat(op_context)) {
+    case NCHW:
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.oz, dims.ox, dims.oy});
+      break;
+    case NHWC:
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.ox, dims.oy, dims.oz});
+      break;
+    default:
+      // TODO(b/77722245): support cost estimation for NCHW_VECT_C.
+      LOG(WARNING) << "unsupported data format: "
+                   << GetDataFormat(op_context.op_info)
+                   << " Defaulting to NHWC.";
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.ox, dims.oy, dims.oz});
+      unknown_conv_format = true;
+      break;
+  }
+
+  // Add the operations the fused op always computes.
+  std::vector<OpContext> component_ops = {
+      FusedChildContext(op_context, "Conv2D", output, {conv_input, filter}),
+      FusedChildContext(op_context, "Mul", output, {output, conv_input_scale}),
+      FusedChildContext(op_context, "BiasAdd", output, {output, bias}),
+      FusedChildContext(op_context, "Relu", output, {output})};
+
+  // Add our side_input iff it's non-empty.
+  if (side_input.shape().dim_size() > 0) {
+    component_ops.push_back(FusedChildContext(op_context, "Mul", side_input,
+                                              {side_input, side_input_scale}));
+    component_ops.push_back(
+        FusedChildContext(op_context, "Add", output, {side_input, output}));
+  }
+
+  // Construct an op_context which definitely has our output shape.
+  auto op_context_with_output = op_context;
+  op_context_with_output.op_info.mutable_outputs()->Clear();
+  *op_context_with_output.op_info.mutable_outputs()->Add() = output;
+
+  // Construct component operations and run the cost computation.
+  auto costs = PredictFusedOp(op_context_with_output, component_ops);
+  costs.inaccurate |= found_unknown_shapes || unknown_conv_format;
+  return costs;
+}
+
 Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
   bool found_unknown_shapes = false;
@@ -1086,6 +1173,66 @@ Costs OpLevelCostEstimator::PredictGatherOrSlice(
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictFusedOp(
+    const OpContext& op_context,
+    const std::vector<OpContext>& fused_op_contexts) const {
+  // Note that PredictOpCountBasedCost will get the correct memory_time from
+  // the node's inputs and outputs; but we don't want to have to re-implement
+  // the logic for computing the operation count of each of our component
+  // operations here; so we simply add the compute times of each component
+  // operation, then update the execution time.
+  Costs fused_cost = PredictOpCountBasedCost(0, op_context.op_info);
+  fused_cost.compute_time = 0;
+  fused_cost.inaccurate = false;
+  for (auto& fused_op : fused_op_contexts) {
+    auto op_cost = PredictCosts(fused_op);
+    fused_cost.compute_time += op_cost.compute_time;
+    fused_cost.inaccurate |= op_cost.inaccurate;
+  }
+
+  CombineCostsAndUpdateExecutionTime(&fused_cost);
+  return fused_cost;
+}
+
+/* static */
+OpContext OpLevelCostEstimator::FusedChildContext(
+    const OpContext& parent, const string& op_name,
+    const OpInfo::TensorProperties& output,
+    const std::vector<OpInfo::TensorProperties>& inputs) {
+  // Setup the base parameters of our new context.
+  OpContext new_context;
+  new_context.name = op_name;
+  new_context.device_name = parent.device_name;
+  new_context.op_info = parent.op_info;
+  new_context.op_info.set_op(op_name);
+
+  // Setup the inputs of our new context.
+  new_context.op_info.mutable_inputs()->Clear();
+  for (const auto& input : inputs) {
+    *new_context.op_info.mutable_inputs()->Add() = input;
+  }
+
+  // Setup the output of our new context.
+  new_context.op_info.mutable_outputs()->Clear();
+  *new_context.op_info.mutable_outputs()->Add() = output;
+
+  return new_context;
+}
+
+/* static */
+OpInfo::TensorProperties OpLevelCostEstimator::DescribeTensor(
+    DataType type, const std::vector<int64>& dims) {
+  OpInfo::TensorProperties ret;
+  ret.set_dtype(type);
+
+  auto shape = ret.mutable_shape();
+  for (const int dim : dims) {
+    shape->add_dim()->set_size(dim);
+  }
+
+  return ret;
+}
+
 /* static */
 OpLevelCostEstimator::ConvolutionDimensions
 OpLevelCostEstimator::OpDimensionsFromInputs(
@@ -1371,6 +1518,21 @@ Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
   return costs;
 }
 
+/* static */
+OpLevelCostEstimator::ConvolutionFormat
+OpLevelCostEstimator::GetConvolutionFormat(const OpContext& op_context) {
+  auto data_format = GetDataFormat(op_context.op_info);
+  if (data_format == "NCHW") {
+    return NCHW;
+  } else if (data_format == "NHWC") {
+    return NHWC;
+  } else if (data_format == "NCHW_VECT_C") {
+    return NCHW_VECT_C;
+  }
+
+  return UNKNOWN_CONVOLUTION_FORMAT;
+}
+
 void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
     Costs* costs) const {
   if (compute_memory_overlap_) {
@@ -1379,6 +1541,5 @@ void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
     costs->execution_time = costs->compute_time + costs->memory_time;
   }
 }
-
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 70802646985e6d65577daa97b92a4fe5177d7f40..35649f7ee959a292dbf68246221bc98c52f2db37 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -82,6 +82,13 @@ class OpLevelCostEstimator {
     int64 sy;         // Stride y.
     Padding padding;  // SAME or VALID.
   };
+  enum ConvolutionFormat {
+    UNKNOWN_CONVOLUTION_FORMAT,
+    NHWC,
+    NCHW,
+    NCHW_VECT_C,
+    NCHW_VECT_W,
+  };
   int64 CountConv2DOperations(const OpInfo& op_features,
                               bool* found_unknown_shapes) const;
   int64 CountConv2DOperations(const OpInfo& op_features,
@@ -138,6 +145,7 @@ class OpLevelCostEstimator {
   Costs PredictCwiseOp(const OpContext& op_context) const;
   Costs PredictConv2DBackpropInput(const OpContext& op_context) const;
   Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
+  Costs PredictFusedConv2DBiasActivation(const OpContext& op_context) const;
   Costs PredictMatMul(const OpContext& op_context) const;
   Costs PredictNoOp(const OpContext& op_context) const;
   Costs PredictIdentity(const OpContext& op_context) const;
@@ -152,6 +160,10 @@ class OpLevelCostEstimator {
   Costs PredictFusedBatchNorm(const OpContext& op_context) const;
   Costs PredictFusedBatchNormGrad(const OpContext& op_context) const;
 
+  // Generic cost prediction method for fused operations.
+  Costs PredictFusedOp(const OpContext& op_context,
+                       const std::vector<OpContext>& fused_op_contexts) const;
+
   // Utility function for safe division. Returns 0
   // if rhs is 0 or negative.
   static double SafeDiv(const double lhs, const double rhs) {
@@ -173,6 +185,20 @@ class OpLevelCostEstimator {
       const TensorShapeProto& original_image_shape, const OpInfo& op_info,
       bool* found_unknown_shapes);
 
+  // Helper to construct child operation contexts for the component operations
+  // of fused ops.
+  static OpContext FusedChildContext(
+      const OpContext& parent, const string& op_name,
+      const OpInfo::TensorProperties& output,
+      const std::vector<OpInfo::TensorProperties>& inputs);
+
+  // Helper to construct tensor shapes.
+  static OpInfo::TensorProperties DescribeTensor(
+      DataType type, const std::vector<int64>& dims);
+
+  // Returns the Conv2D format for this operation.
+  static ConvolutionFormat GetConvolutionFormat(const OpContext& op_context);
+
   // This method calculates the execution time depending on whether IO can
   // overlap with computation. It assumes the memory and the compute times have
   // already been calculated.
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index d797a8a8c1943133f7c92f01eb9a61ae0d1e3b4f..13ea43bed692828f00e89b7f964c3abcdcdb6483 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -93,6 +93,14 @@ OpContext DescribeBatchMatMul(const std::vector<int>& dims_a,
   return op_context;
 }
 
+// Wrangles the minimum number of proto fields to set up a 1D Tensor for cost
+// estimation purposes.
+void DescribeTensor1D(int dim0, OpInfo::TensorProperties* tensor) {
+  auto shape = tensor->mutable_shape();
+  shape->add_dim()->set_size(dim0);
+  tensor->set_dtype(DT_FLOAT);
+}
+
 // Wrangles the minimum number of proto fields to set up a 4D Tensor for cost
 // estimation purposes.
 void DescribeTensor4D(int dim0, int dim1, int dim2, int dim3,
@@ -120,6 +128,38 @@ OpContext DescribeConvolution(int batch, int ix, int iy, int iz1, int iz2,
   return op_context;
 }
 
+// DescribeFusedConv2DBiasActivation constructs an OpContext for a
+// FusedConv2DBiasActivation applied to a convolution input tensor with shape
+// (batch, ix, iy, iz1), a kernel tensor with shape (kx, ky, iz2, oz), a
+// bias tensor with shape (oz), a side input tensor with shape
+// (batch, ox, oy, oz) if has_side_input is set, and two scaling tensors with
+// shape (1).
+//
+// Note that this assumes the NHWC data format.
+OpContext DescribeFusedConv2DBiasActivation(int batch, int ix, int iy, int iz1,
+                                            int iz2, int kx, int ky, int ox,
+                                            int oy, int oz,
+                                            bool has_side_input) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("FusedConv2DBiasActivation");
+  DescribeTensor4D(batch, ix, iy, iz1, op_context.op_info.add_inputs());
+  DescribeTensor4D(kx, ky, iz2, oz, op_context.op_info.add_inputs());
+  DescribeTensor1D(oz, op_context.op_info.add_inputs());
+
+  // Add the side_input, if any.
+  auto side_input = op_context.op_info.add_inputs();
+  if (has_side_input) {
+    DescribeTensor4D(batch, ox, oy, oz, side_input);
+  }
+
+  // Add the scaling tensors.
+  DescribeTensor1D(1, op_context.op_info.add_inputs());
+  DescribeTensor1D(1, op_context.op_info.add_inputs());
+
+  return op_context;
+}
+
 // DescribeUnaryOp constructs an OpContext for the given operation applied to
 // a 4-tensor with shape (size1, 1, 1, 1).
 OpContext DescribeUnaryOp(const string& op, int size1) {
@@ -162,12 +202,9 @@ OpContext DescribeBiasAdd(int size1, int size2) {
   op_context.op_info.set_op("BiasAdd");
 
   DescribeTensor4D(1, 1, size2, size1, op_context.op_info.add_inputs());
+  DescribeTensor1D(size1, op_context.op_info.add_inputs());
   DescribeTensor4D(1, 1, size2, size1, op_context.op_info.add_outputs());
 
-  auto bias = op_context.op_info.add_inputs();
-  bias->mutable_shape()->add_dim()->set_size(size1);
-  bias->set_dtype(DT_FLOAT);
-
   return op_context;
 }
 
@@ -486,6 +523,25 @@ TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
   SetComputeMemoryOverlap(false);  // Set it back to default.
 }
 
+TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationExecutionTime) {
+  auto cost = PredictCosts(DescribeFusedConv2DBiasActivation(
+      16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true));
+  EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest,
+       FusedConv2DBiasActivationNoSideInputExecutionTime) {
+  auto cost = PredictCosts(DescribeFusedConv2DBiasActivation(
+      16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ false));
+  EXPECT_EQ(Costs::Duration(825345), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(355321038), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(356146383), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, MulExecutionTime) {
   auto cost = PredictCosts(DescribeBinaryOp("Mul", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc
index 9866bc86887e2fa1a1fcfe95e3e9673b7df1a8f3..5151b87c59cc09934871b225c70e785c8f9093dd 100644
--- a/tensorflow/core/grappler/costs/robust_stats.cc
+++ b/tensorflow/core/grappler/costs/robust_stats.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/robust_stats.h"
 #include <algorithm>
 #include <cmath>
+#include <utility>
 
 namespace tensorflow {
 namespace grappler {
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index ad86356504e06d31ccc0a315fbd6991e49df0f19..bbc0fedd22ba640c38556eb3267cf0a82686d0ea 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -27,7 +27,7 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef) {
+GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef* graph_def) {
   id = other.id;
   feed = other.feed;
   fetch = other.fetch;
@@ -38,7 +38,7 @@ GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef) {
   restore_op = other.restore_op;
   save_restore_loc_tensor = other.save_restore_loc_tensor;
   queue_runners = other.queue_runners;
-  graph.Swap(&graphDef);
+  graph.Swap(graph_def);
 }
 
 std::vector<const NodeDef*> GrapplerItem::MainOpsFanin() const {
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 06bba544c315476219ee83684df59a3da8720eea..cd165ac3d460fb0dbd644561efaabe91b0cd0aea 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -33,10 +33,13 @@ namespace grappler {
 // A TensorFlow model to optimize.
 // Models are represented by the combination of a graph, one of more fetch
 // nodes, and potentially a set of nodes to feed.
-// TODO(volunteer_needed): turn this struct into a class.
 struct GrapplerItem {
-  GrapplerItem() {}
-  GrapplerItem(const GrapplerItem& other, GraphDef&& graphDef);
+  GrapplerItem() = default;
+  GrapplerItem(const GrapplerItem& other, GraphDef&& graph_def)
+      : GrapplerItem(other, &graph_def) {}
+  // Swaps *graph_def with an empty GraphDef.
+  GrapplerItem(const GrapplerItem& other, GraphDef* graph_def);
+  virtual ~GrapplerItem() = default;
 
   string id;  // A unique id for this item
 
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index a24d2dbd9f767e7973651f6c954f78c7d80d7978..9c45aed62ffa9e34463e05c3bdfb292ba0d1a3a3 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -245,6 +245,8 @@ bool IsPolygamma(const NodeDef& node) { return node.op() == "Polygamma"; }
 
 bool IsPow(const NodeDef& node) { return node.op() == "Pow"; }
 
+bool IsPrint(const NodeDef& node) { return node.op() == "Print"; }
+
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
@@ -454,15 +456,38 @@ bool IsInvolution(const NodeDef& node) {
   return involution_ops.count(node.op()) > 0;
 }
 
-bool IsValuePreserving(const NodeDef& node) {
+bool IsValueAndOrderPreserving(const NodeDef& node) {
   if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
     return true;
   }
+  const std::unordered_set<string> value_and_order_preserving_ops{
+      "CheckNumerics",
+      "DebugGradientIdentity",
+      "DeepCopy"
+      "Enter",
+      "Exit",
+      "ExpandDims",
+      "Identity",
+      "IdentityN",
+      "PreventGradient",
+      "Print",
+      "Reshape",
+      "Snapshot",
+      "Squeeze",
+      "StopGradient",
+  };
+  return value_and_order_preserving_ops.count(node.op()) > 0;
+}
+
+bool IsValuePreserving(const NodeDef& node) {
   const std::unordered_set<string> value_preserving_ops{
-      "Transpose",  "Reshape",      "Identity",        "InvertPermutation",
-      "Reverse",    "StopGradient", "PreventGradient", "CheckNumerics",
-      "ExpandDims", "Squeeze"};
-  return value_preserving_ops.count(node.op()) > 0;
+      "InvertPermutation",
+      "Reverse",
+      "Roll",
+      "Transpose",
+  };
+  return IsValueAndOrderPreserving(node) ||
+         value_preserving_ops.count(node.op()) > 0;
 }
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 8667f72c7ecd213d61c92edabc62610a7e7f1595..79fd05e1870684c0c7ab598a90999890b6e24cf6 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -95,6 +95,7 @@ bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
 bool IsPolygamma(const NodeDef& node);
+bool IsPrint(const NodeDef& node);
 bool IsProd(const NodeDef& node);
 bool IsPow(const NodeDef& node);
 bool IsReal(const NodeDef& node);
@@ -167,6 +168,10 @@ bool ModifiesInputsInPlace(const NodeDef& node);
 // own inverse such that f(f(x)) == x.
 bool IsInvolution(const NodeDef& node);
 
+// Returns true if the op preserves the order and value of elements in its
+// first input tensor and possible changes its shape.
+bool IsValueAndOrderPreserving(const NodeDef& node);
+
 // Returns true if the op in node only rearranges the order of elements in its
 // first input tensor and possible changes its shape. More precisely, this
 // function returns true if the op commutes with all element-wise operations.
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 122fd48584f6170da7ff175f4a8b5ed2e436623f..96342fedc17d41e84cd338c92c14c432d96ac15d 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -11,6 +11,10 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_protos_grappler",
 )
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
+)
 
 cc_library(
     name = "static_schedule",
@@ -357,9 +361,11 @@ tf_cuda_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -477,9 +483,13 @@ tf_cuda_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:single_machine",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/costs:virtual_placer",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -531,11 +541,28 @@ tf_cuda_cc_test(
     ],
 )
 
+# This rule is header-only unless the build is static (--config=monolithic). Its
+# implementation is included directly in the framework shared object.
 cc_library(
     name = "custom_graph_optimizer_registry",
-    srcs = ["custom_graph_optimizer_registry.cc"],
     hdrs = ["custom_graph_optimizer_registry.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":custom_graph_optimizer",
+        "//tensorflow/core:lib",
+    ] + if_static(
+        [":custom_graph_optimizer_registry_impl"],
+    ),
+)
+
+# This rule contains static variables for the optimizer registry. Do not depend
+# on it directly; use :custom_graph_optimizer_registry, and link against
+# libtensorflow_framework.so for the registry symbols.
+cc_library(
+    name = "custom_graph_optimizer_registry_impl",
+    srcs = ["custom_graph_optimizer_registry.cc"],
+    hdrs = ["custom_graph_optimizer_registry.h"],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":custom_graph_optimizer",
         "//tensorflow/core:lib",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 7bf264ba3051f75203a4730b59178152504fe111..b80ae5fa407ce5dba1c7b5221203a066811c72c6 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -253,9 +253,8 @@ NodeDef* GetTailOfValuePreservingChain(
     const NodeDef& node, const NodeMap& node_map,
     const std::unordered_set<string>& nodes_to_preserve) {
   auto is_value_preserving_non_branching = [&](const NodeDef& node) {
-    return IsValuePreserving(node) &&
-           NumNonControlOutputs(node, node_map) == 1 &&
-           nodes_to_preserve.count(node.name()) == 0;
+    return nodes_to_preserve.find(node.name()) == nodes_to_preserve.end() &&
+           IsValuePreserving(node) && NumNonControlOutputs(node, node_map) == 1;
   };
   return GetTailOfChain(node, node_map, /*follow_control_input=*/false,
                         is_value_preserving_non_branching);
@@ -279,6 +278,7 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
         ctx_ext_(ctx_ext) {}
   virtual ~ArithmeticOptimizerStage() = default;
 
+ protected:
   // Simplification graph rewrite can create additional nodes that are inputs
   // to final simplified node, they can be also added to the arithmetic
   // optimizer queue for further optimization.
@@ -304,10 +304,176 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
   }
 
  private:
-  // extened context required for ArithmeticOptimizer
+  // Extended context required for ArithmeticOptimizer.
   const ArithmeticOptimizerContext ctx_ext_;
 };
 
+// Subtype of ArithmeticOptimizerStage that does optimization by rewriting a
+// group of nodes from the optimized graph.
+//
+// * AddOpsRewrite:
+//   Rewrite a group of Add/AddN with compact Add/AddN tree
+//
+// * MinimizeBroadcasts:
+//   Rewrite a group of binary associative ops, reordering
+//   inputs, to minimize the cost of broadcast
+class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
+ public:
+  explicit ArithmeticNodesGroupOptimizerStage(
+      const string& name, const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext ctx_ext)
+      : ArithmeticOptimizerStage(name, ctx, ctx_ext), optimized_nodes_{} {}
+  ~ArithmeticNodesGroupOptimizerStage() override = default;
+
+  // Input name with a statically inferred shape from GraphProperties
+  struct InputAndShape {
+    InputAndShape(const string& input, const TensorShapeProto& shape)
+        : input(input), shape(shape) {}
+    string input;
+    TensorShapeProto shape;
+  };
+
+  // Subgraph (subtree) of nodes, that we want to optimize in "one shot" (e.g.
+  // all the Add nodes that we plan to rewrite with a single AddN). Subgraph is
+  // obtained by graph traversal, starting from a root node.
+  struct OptimizedNodesGroup {
+    NodeDef* root_node;
+    TensorShapeProto root_shape;
+    // Optimized nodes that will be updated or removed by rewrite
+    std::vector<NodeDef*> optimized_nodes;
+    // Inputs to optimized nodes
+    std::vector<InputAndShape> inputs;
+  };
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
+    OptimizedNodesGroup group;
+    TF_RETURN_IF_ERROR(CreateOptimizedNodesGroup(node, &group));
+
+    if (!group.optimized_nodes.empty()) {
+      *simplified_node_name = RewriteOptimizedNodesGroup(group);
+    }
+
+    return Status::OK();
+  }
+
+ protected:
+  // Modify the optimized graph after nodes group was successfully identified
+  virtual string RewriteOptimizedNodesGroup(
+      const OptimizedNodesGroup& group) = 0;
+
+  // Check if input can become a part of current optimized nodes group.
+  virtual bool IsAbsorbableByOptimizedNodesGroup(
+      const OptimizedNodesGroup& group, const string& input) const = 0;
+
+  Status AbsorbInputByOptimizedNodesGroup(const string& input,
+                                          OptimizedNodesGroup* group) const {
+    NodeDef* node;
+    TF_RETURN_IF_ERROR(GetInputNode(input, &node));
+
+    if (IsAbsorbableByOptimizedNodesGroup(*group, input)) {
+      for (int i = 0; i < node->input_size(); ++i) {
+        const string& input_i = node->input(i);
+        if (!IsControlInput(input)) {
+          TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
+        }
+      }
+      group->optimized_nodes.push_back(node);
+    } else {
+      // If node can't be absorbed, add it to OptimizedNodesGroup input
+      OpInfo::TensorProperties properties;
+      TF_RETURN_IF_ERROR(GetTensorProperties(input, &properties));
+      group->inputs.emplace_back(input, properties.shape());
+    }
+    return Status::OK();
+  }
+
+  Status CreateOptimizedNodesGroup(NodeDef* root_node,
+                                   OptimizedNodesGroup* group) const {
+    OpInfo::TensorProperties root_node_output_properties;
+    TF_RETURN_IF_ERROR(
+        GetTensorProperties(root_node->name(), &root_node_output_properties));
+
+    group->root_node = root_node;
+    group->root_shape = root_node_output_properties.shape();
+
+    group->optimized_nodes.reserve(root_node->input_size());
+    for (int i = 0; i < root_node->input_size(); ++i) {
+      const string& input_i = root_node->input(i);
+      if (!IsControlInput(input_i)) {
+        TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Check if all inputs can be broadcasted to the same shape
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool HasAllInputsBroadcastableToShape(
+      const NodeDef& node, const OpInfo::TensorProperties& properties) const {
+    auto is_broadcastable = [this, &properties](const string& input) {
+      OpInfo::TensorProperties input_props;
+      Status has_input_properties = GetTensorProperties(input, &input_props);
+      return has_input_properties.ok() &&
+             ShapesBroadcastable(properties, input_props);
+    };
+    return std::all_of(node.input().begin(), node.input().end(),
+                       is_broadcastable);
+  }
+
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool IsDrivenByControlDependency(const NodeDef& node) const {
+    return std::any_of(node.input().begin(), node.input().end(),
+                       IsControlInput);
+  }
+
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool DrivesControlDependency(const NodeDef& node) const {
+    int position;
+    for (const NodeDef* output : ctx_.node_map->GetOutputs(node.name())) {
+      for (int i = 0; i < output->input_size(); ++i) {
+        auto input = output->input(i);
+        string name = ParseNodeName(input, &position);
+        if (name == node.name() && /*control input*/ position < 0) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  string ShapeSignature(const TensorShapeProto& shape) const {
+    string signature = strings::StrCat("rank:", shape.dim_size(), ":dim");
+    for (int i = 0; i < shape.dim_size(); ++i)
+      strings::StrAppend(&signature, ":", shape.dim(i).size());
+    return signature;
+  }
+
+  void AddToOptimizedNodes(const NodeDef* node) {
+    optimized_nodes_.insert(node->name());
+  }
+
+  bool IsOnTheSameDevice(const OptimizedNodesGroup& group,
+                         const NodeDef& node) const {
+    return group.root_node->device() == node.device();
+  }
+
+  bool IsInPreserveSet(const NodeDef& node) const {
+    return ctx_.nodes_to_preserve->find(node.name()) !=
+           ctx_.nodes_to_preserve->end();
+  }
+
+  bool IsAlreadyOptimized(const NodeDef& node) const {
+    return optimized_nodes_.find(node.name()) != optimized_nodes_.end();
+  }
+
+ private:
+  // set of nodes already processed by this optimizer stage
+  std::unordered_set<string> optimized_nodes_;
+};
+
 // Rewrite a tree of Add/AddN with a single AddN operation, consuming all the
 // original inputs of absorbed nodes.
 //
@@ -335,110 +501,33 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
 //         x    y      w    Add_3      AddN(x, y, q, e)  z
 //                          / \
 //                         q   e
-class AddOpsRewriteStage : public ArithmeticOptimizerStage {
+class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
  public:
   explicit AddOpsRewriteStage(const GraphOptimizerContext& ctx,
                               const ArithmeticOptimizerContext& ctx_ext)
-      : ArithmeticOptimizerStage("AddOpsRewrite", ctx, ctx_ext),
-        rewritten_nodes_() {}
-
+      : ArithmeticNodesGroupOptimizerStage("AddOpsRewrite", ctx, ctx_ext) {}
   ~AddOpsRewriteStage() override = default;
 
   // Check if a node can become a root of AddOpsGroup
   bool IsSupported(const NodeDef* node) const override {
-    // check basic preconditions
-    if (!IsRewritable(node)) {
-      return false;
-    }
+    if (!CanOptimize(node)) return false;
 
     // shape must be symbolically defined and all inputs compatible with it
     OpInfo::TensorProperties properties;
     Status has_properties = GetTensorProperties(node->name(), &properties);
     return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
-           HasAllInputsOfBroadcastableShape(*node, properties);
+           HasAllInputsBroadcastableToShape(*node, properties);
   }
 
-  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
-    AddOpsGroup group;
-    TF_RETURN_IF_ERROR(CreateAddOpsGroup(node, &group));
-
-    if (!group.absorbed_nodes.empty()) {
-      *simplified_node_name = RewriteAddOpsGroup(group);
-    }
-
-    return Status::OK();
-  }
-
- private:
-  // Input name with a statically inferred shape from GraphProperties
-  struct InputAndShape {
-    InputAndShape(const string& input, const TensorShapeProto& shape)
-        : input(input), shape(shape) {}
-    string input;
-    TensorShapeProto shape;
-  };
-
-  // Holds together an add ops subgraph that we want to rewrite together.
-  //
-  // For the graph above the AddOpsGroup will be:
-  //   root_node: AddN_1
-  //   absorbed_nodes: [Add_1, Add_2]
-  //   input_nodes: [x, y, z, w, q, e]
-  struct AddOpsGroup {
-    const NodeDef* root_node;
-    TensorShapeProto root_shape;
-    // Add/AddN operations below the root level that were absorbed by this group
-    std::vector<NodeDef*> absorbed_nodes;
-    // Inputs of absorbed nodes that will be forwarded to optimized AddN ops
-    std::vector<InputAndShape> inputs;
-  };
-
-  // Check if all inputs can be broadcasted to the same shape
-  bool HasAllInputsOfBroadcastableShape(
-      const NodeDef& node, const OpInfo::TensorProperties& properties) const {
-    const AddOpsRewriteStage* self = this;
-    return std::all_of(
-        node.input().begin(), node.input().end(),
-        [self, &properties](const string& input) {
-          OpInfo::TensorProperties input_properties;
-          Status has_input_properties =
-              self->GetTensorProperties(input, &input_properties);
-          return has_input_properties.ok() &&
-                 ShapesBroadcastable(properties, input_properties);
-        });
-  }
-
-  // TODO(ezhulenev): use GraphRewriter?
-  bool IsDrivenByControlDependency(const NodeDef& node) const {
-    return std::any_of(node.input().begin(), node.input().end(),
-                       IsControlInput);
-  }
-
-  // TODO(ezhulenev): use GraphRewriter?
-  bool DrivesControlDependency(const NodeDef& node) const {
-    int position;
-    for (const NodeDef* output : ctx_.node_map->GetOutputs(node.name())) {
-      for (int i = 0; i < output->input_size(); ++i) {
-        auto input = output->input(i);
-        string name = ParseNodeName(input, &position);
-        if (name == node.name() && /*control input*/ position < 0) {
-          return true;
-        }
-      }
-    }
-    return false;
-  }
-
-  // Check if a node can be absorbed by current AddOpsGroup
-  bool IsAbsorbableByAddOpsGroup(const string& name, const AddOpsGroup& group) {
+ protected:
+  // Check if a node can be absorbed by current OptimizedNodesGroup
+  bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
+                                         const string& input) const override {
     NodeDef* node;
-    Status node_status = GetInputNode(name, &node);
-    if (!node_status.ok()) {
-      return false;
-    }
-    // check basic preconditions
-    if (!IsRewritable(node)) {
+    Status node_status = GetInputNode(input, &node);
+    if (!node_status.ok() || !CanOptimize(node)) return false;
+
+    if (!IsOnTheSameDevice(group, *node)) {
       return false;
     }
     // with a single output data consumer (presumably if we reach this node from
@@ -447,102 +536,42 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (NumNonControlDataOutputs(*node, *ctx_.node_map) != 1) {
       return false;
     }
-    // must be on the same device as a root node
-    if (node->device() != group.root_node->device()) {
-      return false;
-    }
     // All input shapes must be broadcastable to the node shape
     OpInfo::TensorProperties properties;
-    Status has_properties = GetTensorProperties(name, &properties);
+    Status has_properties = GetTensorProperties(input, &properties);
     return has_properties.ok() &&
-           HasAllInputsOfBroadcastableShape(*node, properties);
+           HasAllInputsBroadcastableToShape(*node, properties);
   }
 
   // Node requirements both for a root node and an absorbed node
-  bool IsRewritable(const NodeDef* node) const {
-    // only Add or AddN can be a root node
+  bool CanOptimize(const NodeDef* node) const {
     // TODO(ezhulenev): check if AccumulateNV2 can be supported too
     if (!IsAdd(*node) && !IsAddN(*node)) {
       return false;
     }
-    // it must not be in a preserve set
-    if (ctx_.nodes_to_preserve->find(node->name()) !=
-        ctx_.nodes_to_preserve->end()) {
-      return false;
-    }
-    // it must not be a node created or absorbed by previous iteration
-    if (rewritten_nodes_.find(node->name()) != rewritten_nodes_.end()) {
+    if (IsInPreserveSet(*node) || IsAlreadyOptimized(*node)) {
       return false;
     }
     // it must not be created by this stage at any of previous optimization runs
-    if (StringPiece(node->name()).contains(stage_name_)) {
+    if (str_util::StrContains(node->name(), stage_name_)) {
       return false;
     }
-    // should not drive or be driven by control dependency
     // TODO(ezhulenev): relax this condition for root node
     return !(IsDrivenByControlDependency(*node) ||
              DrivesControlDependency(*node));
   }
 
-  // Create an AddOpsGroup with a root in a given node
-  Status CreateAddOpsGroup(const NodeDef* root_node, AddOpsGroup* group) {
-    OpInfo::TensorProperties root_node_output_properties;
-    TF_RETURN_IF_ERROR(
-        GetTensorProperties(root_node->name(), &root_node_output_properties));
-
-    group->root_node = root_node;
-    group->root_shape = root_node_output_properties.shape();
-
-    group->absorbed_nodes.reserve(root_node->input_size());
-    for (int i = 0; i < root_node->input_size(); ++i) {
-      const string& input_i = root_node->input(i);
-      if (!IsControlInput(input_i)) {
-        TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(input_i, group));
-      }
-    }
-
-    return Status::OK();
-  }
-
-  Status AbsorbInputByAddOpsGroup(const string& input, AddOpsGroup* group) {
-    NodeDef* node;
-    TF_RETURN_IF_ERROR(GetInputNode(input, &node));
-
-    if (IsAbsorbableByAddOpsGroup(input, *group)) {
-      group->absorbed_nodes.push_back(node);
-      for (int i = 0; i < node->input_size(); ++i) {
-        const string& input_i = node->input(i);
-        if (!IsControlInput(input)) {
-          TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(input_i, group));
-        }
-      }
-    } else {
-      // If node can't be absorbed, add it to AddOpsGroup input
-      OpInfo::TensorProperties properties;
-      TF_RETURN_IF_ERROR(GetTensorProperties(input, &properties));
-      group->inputs.emplace_back(input, properties.shape());
-    }
-    return Status::OK();
-  }
-
-  // Rewrite an add ops group into a single AddN if all input shapes are
+  // Rewrite a group of add ops into a single AddN if all input shapes are
   // symbolically equal. If not, create AddN for equal shapes first, and then
   // build an Add tree, minimizing the cost of broadcasts.
-  string RewriteAddOpsGroup(const AddOpsGroup& group) {
+  string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
     // all new nodes will be placed under the scope of a root node
     auto root_scope_and_name = ParseNodeScopeAndName(group.root_node->name());
 
-    auto shape_sig = [](const TensorShapeProto& shape) {
-      string name = strings::StrCat("r:", shape.dim_size(), ":d");
-      for (int i = 0; i < shape.dim_size(); ++i)
-        strings::StrAppend(&name, ":", shape.dim(i).size());
-      return name;
-    };
-
     // Find what shapes are present in the inputs of absorbed nodes
     std::unordered_map<string, std::vector<InputAndShape>> shape_sig_to_inputs;
     for (const auto& input : group.inputs) {
-      shape_sig_to_inputs[shape_sig(input.shape)].push_back(input);
+      shape_sig_to_inputs[ShapeSignature(input.shape)].push_back(input);
     }
 
     // Collect all the shapes from representative elements
@@ -556,8 +585,6 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
       string node_name = OptimizedNodeName(root_scope_and_name);
       AddInputsOfSymbolicallyEqualShape(*group.root_node, node_name,
                                         group.inputs);
-      // keep track of nodes that were created or absorbed as a part of rewrite
-      rewritten_nodes_.insert(node_name);
       return node_name;
     }
 
@@ -586,7 +613,7 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     // Prepare leaf AddN nodes for inputs of equal shape
     for (int i = 0; i < shapes.size(); ++i) {
       const auto node_name = leaf_node_name(i);
-      const auto& inputs = shape_sig_to_inputs[shape_sig(shapes[i])];
+      const auto& inputs = shape_sig_to_inputs[ShapeSignature(shapes[i])];
       add_ops.push_back(AddInputsOfSymbolicallyEqualShape(*group.root_node,
                                                           node_name, inputs));
     }
@@ -637,7 +664,7 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
       node->add_input(inputAndShape.input);
     }
 
-    rewritten_nodes_.insert(node_name);
+    AddToOptimizedNodes(node);
     return InputAndShape(node_name, shape);
   }
 
@@ -661,24 +688,26 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     node->add_input(left.input);
     node->add_input(right.input);
 
-    rewritten_nodes_.insert(node_name);
+    AddToOptimizedNodes(node);
     return InputAndShape(
         node_name, TensorShapeProto());  // shape is not important at this point
   }
-
-  // keep nodes that were added or absorbed as a part of AddOpsGroup rewrite
-  std::unordered_set<string> rewritten_nodes_;
 };
 
-// Use the commutativity and (left- and right-) distributive property of
-// multiplication over addition to hoist common factors out of aggregate nodes
-// where all the inputs are Mul nodes. This pattern occurs frequently in
-// regularization terms for the gradients during training.
+// Use the distributive property of multiplication and division over addition,
+// along with commutativity of the former, to hoist common factors/denominators
+// out of aggregate nodes where ALL the inputs are Mul/Div nodes.
+// This pattern occurs frequently in regularization terms for the gradients
+// during training.
 //
 // For example, we can rewrite an expression of the form:
 //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
 // to the following:
 //   Mul(x, AddN(y1, y2, y3, ... yn))
+// For division, we can rewrite
+//   AddN(Div(y1, x), Div(y2, x), Div(y3, x), ... Div(yn, x))
+// to:
+//   Div(AddN(y1, y2, y3, ... yn), x)
 class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
  public:
   explicit HoistCommonFactorOutOfAggregation(
@@ -693,11 +722,13 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
+    bool common_factor_is_denominator = false;
     std::set<string> common_factors;
     std::vector<string> ctrl_deps;
-    TF_RETURN_IF_ERROR(GetCommonFactors(node, &common_factors, &ctrl_deps));
+    TF_RETURN_IF_ERROR(GetCommonFactors(
+        node, &common_factors, &common_factor_is_denominator, &ctrl_deps));
 
     if (common_factors.size() == 1) {
       const string& common_factor = *common_factors.begin();
@@ -705,24 +736,31 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
       // Gather up the non-shared factors
       bool shapes_match = true;
       std::vector<string> unique_factors;
-      TF_RETURN_IF_ERROR(GetUniqueFactors(node, common_factor, &shapes_match,
-                                          &unique_factors));
+      TF_RETURN_IF_ERROR(GetUniqueFactors(node, common_factor,
+                                          common_factor_is_denominator,
+                                          &shapes_match, &unique_factors));
 
       if (shapes_match) {
         NodeDef* input_0;
         TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input_0));
 
-        // Use a copy of the first Mul node for the outer multiplication.
-        NodeDef* new_mul_node = AddCopyNode(OuterMulNodeName(node), input_0);
+        // Use a copy of the first node for the outer multiplication/division.
+        NodeDef* new_outer_node = AddCopyNode(
+            OuterNodeName(node, common_factor_is_denominator), input_0);
         // And a copy of aggregation node as one of the inner operands
         NodeDef* new_add_node = AddCopyNode(InnerAddNodeName(node), node);
 
-        new_mul_node->set_device(node->device());
-        new_mul_node->set_input(0, common_factor);
-        new_mul_node->set_input(1, new_add_node->name());
+        new_outer_node->set_device(node->device());
+        if (common_factor_is_denominator) {
+          new_outer_node->set_input(0, new_add_node->name());
+          new_outer_node->set_input(1, common_factor);
+        } else {
+          new_outer_node->set_input(0, common_factor);
+          new_outer_node->set_input(1, new_add_node->name());
+        }
 
-        ctx_.node_map->AddOutput(common_factor, new_mul_node->name());
-        ctx_.node_map->AddOutput(new_add_node->name(), new_mul_node->name());
+        ctx_.node_map->AddOutput(common_factor, new_outer_node->name());
+        ctx_.node_map->AddOutput(new_add_node->name(), new_outer_node->name());
 
         // Hoist non-shared factors up into the new AddN node.
         for (int i = 0; i < unique_factors.size(); ++i) {
@@ -741,17 +779,18 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
         AddToOptimizationQueue(new_add_node);
         // do not optimize the same node twice
         rewritten_nodes_.insert(node->name());
-        *simplified_node_name = new_mul_node->name();
+        *simplified_node_name = new_outer_node->name();
       }
     }
     return Status::OK();
   }
 
  private:
-  // Get a name for new outer Mul node
-  string OuterMulNodeName(const NodeDef* node) const {
+  // Get a name for new outer node
+  string OuterNodeName(const NodeDef* node, bool is_div) const {
     auto scope_and_name = ParseNodeScopeAndName(node->name());
-    return OptimizedNodeName(scope_and_name, "Mul");
+    return is_div ? OptimizedNodeName(scope_and_name, "Div")
+                  : OptimizedNodeName(scope_and_name, "Mul");
   }
 
   // Get a name new inner Add node
@@ -760,11 +799,17 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     return OptimizedNodeName(scope_and_name, "Add");
   }
 
-  // Determine the set of common factors if the input nodes are all Mul nodes.
+  // Determine the set of common factors if the input nodes are all Mul or
+  // Div nodes.
   Status GetCommonFactors(const NodeDef* node, std::set<string>* common_factors,
+                          bool* common_factor_is_denominator,
                           std::vector<string>* ctrl_deps) const {
     CHECK(common_factors->empty());
+    CHECK_NOTNULL(common_factor_is_denominator);
+    *common_factor_is_denominator = false;
 
+    bool has_mul = false;
+    bool has_div = false;
     for (int i = 0; i < node->input_size(); ++i) {
       if (i > 0 && common_factors->empty()) break;
       if (IsControlInput(node->input(i))) {
@@ -774,12 +819,36 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
       NodeDef* input;
       TF_RETURN_IF_ERROR(GetInputNode(node->input(i), &input));
 
-      if (!IsMul(*input)) {
+      if ((!IsMul(*input) && !IsAnyDiv(*input)) || (IsMul(*input) && has_div) ||
+          (IsAnyDiv(*input) && has_mul)) {
+        // Break if input is neither a Mul or Div, or if there are both Mul &
+        // Div Ops.
         common_factors->clear();
         break;
+      } else if (IsAnyDiv(*input)) {
+        has_div = true;
+        // In case of possible common dividers, we avoid hoisting out if any
+        // input is not float/double, since integer division is not distributive
+        // over addition.
+        OpInfo::TensorProperties properties0, properties1;
+        TF_RETURN_IF_ERROR(GetTensorProperties(input->input(0), &properties0));
+        TF_RETURN_IF_ERROR(GetTensorProperties(input->input(1), &properties1));
+        if (properties0.dtype() != DT_FLOAT &&
+            properties0.dtype() != DT_DOUBLE &&
+            properties1.dtype() != DT_FLOAT &&
+            properties1.dtype() != DT_DOUBLE) {
+          common_factors->clear();
+          break;
+        }
+      } else if (IsMul(*input)) {
+        has_mul = true;
       }
 
-      std::set<string> factors_i{input->input(0), input->input(1)};
+      // We only focus on common factors from denominators if any Op is a
+      // Div.
+      std::set<string> factors_i =
+          has_mul ? std::set<string>{input->input(0), input->input(1)}
+                  : std::set<string>{input->input(1)};
       if (i == 0) {
         std::swap(*common_factors, factors_i);
       } else {
@@ -794,6 +863,8 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
         ctrl_deps->push_back(input->input(i));
       }
     }
+
+    *common_factor_is_denominator = has_div;
     return Status::OK();
   }
 
@@ -802,6 +873,7 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   // have the same shape since the other aggregation ops do not support
   // broadcasting.
   Status GetUniqueFactors(const NodeDef* node, const string& common_factor,
+                          const bool common_factor_is_denominator,
                           bool* shapes_match,
                           std::vector<string>* unique_factors) const {
     *shapes_match = true;
@@ -812,11 +884,13 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
       if (IsControlInput(input)) {
         break;
       }
-      NodeDef* mul_node;
-      TF_RETURN_IF_ERROR(GetInputNode(input, &mul_node));
+      NodeDef* inner_node;
+      TF_RETURN_IF_ERROR(GetInputNode(input, &inner_node));
       const int unique_factor_index =
-          mul_node->input(0) == common_factor ? 1 : 0;
-      unique_factors->push_back(mul_node->input(unique_factor_index));
+          common_factor_is_denominator
+              ? 0
+              : (inner_node->input(0) == common_factor ? 1 : 0);
+      unique_factors->push_back(inner_node->input(unique_factor_index));
       if (i > 0 && !IsAdd(*node)) {
         OpInfo::TensorProperties lhs;
         OpInfo::TensorProperties rhs;
@@ -832,13 +906,209 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     // if graph rewrite happens in multiple passes without graph pruning between
     // them, it's possible that rewritten node already exists in a graph
     return rewritten_nodes_.find(node->name()) != rewritten_nodes_.end() ||
-           ctx_.node_map->NodeExists(OuterMulNodeName(node));
+           ctx_.node_map->NodeExists(OuterNodeName(node, false)) ||
+           ctx_.node_map->NodeExists(OuterNodeName(node, true));
   }
 
   // keep names of the nodes that were optimized by this stage
   std::unordered_set<string> rewritten_nodes_;
 };
 
+// Binary associative ops can be re-ordered to minimize the number of broadcasts
+// and the size of a temporary tensors.
+//
+// Example: [a, c] - scalars, [b, d] - matrices
+//   @ - binary associative op (Add or Mul)
+//   @* - broadcast
+//
+//           @                      @*
+//        /     \                /      \
+//      @*       @*      ->     @        @
+//    /   \    /   \          /   \    /   \
+//   a     b  c     d        a     c  b     d
+class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
+ public:
+  explicit MinimizeBroadcasts(const GraphOptimizerContext& ctx,
+                              const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticNodesGroupOptimizerStage("MinimizeBroadcasts", ctx, ctx_ext) {
+  }
+  ~MinimizeBroadcasts() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    if (!IsBinaryAssociative(*node)) return false;
+
+    // has a symbolically defined shape with broadcastable inputs
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node->name(), &properties);
+    return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+ protected:
+  bool IsBinaryAssociative(const NodeDef& node) const {
+    return IsMul(node) || IsAdd(node);
+  }
+
+  bool IsSameOp(const OptimizedNodesGroup& group, const NodeDef& node) const {
+    return group.root_node->op() == node.op();
+  }
+
+  // Check if a node can be absorbed by current OptimizedNodesGroup
+  bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
+                                         const string& input) const override {
+    NodeDef* node;
+    Status node_status = GetInputNode(input, &node);
+    if (!node_status.ok()) return false;
+
+    if (!IsSameOp(group, *node)) {
+      return false;
+    }
+    if (IsInPreserveSet(*node) || IsAlreadyOptimized(*node)) {
+      return false;
+    }
+    if (IsDrivenByControlDependency(*node) || DrivesControlDependency(*node)) {
+      return false;
+    }
+    if (!IsOnTheSameDevice(group, *node)) {
+      return false;
+    }
+    // Optimized nodes updated in place, and that would break the graph, if the
+    // node has multiple output consumers
+    if (NumNonControlOutputs(*node, *ctx_.node_map) != 1) {
+      return false;
+    }
+    // All input shapes must be broadcastable to the node shape
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(input, &properties);
+    return has_properties.ok() &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+  std::size_t CountUniqueShapes(const std::vector<InputAndShape>& inputs) {
+    std::set<string> sigs;
+    for (const auto& ias : inputs) {
+      sigs.insert(ShapeSignature(ias.shape));
+    }
+    return sigs.size();
+  }
+
+  string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
+    if (CountUniqueShapes(group.inputs) <= 1) {
+      // nothing to optimize when all shapes are the same
+      return group.root_node->name();
+    }
+
+    auto num_nodes = /*root*/ 1 + group.optimized_nodes.size();
+    auto num_inputs = group.inputs.size();
+    CHECK_EQ(num_nodes, num_inputs - 1)
+        << "Can't build a tree with " << num_inputs << " inputs, using "
+        << num_nodes << "binary op nodes.";
+
+    std::deque<InputAndShape> add_ops(group.inputs.begin(), group.inputs.end());
+    std::deque<NodeDef*> optimized_nodes(group.optimized_nodes.begin(),
+                                         group.optimized_nodes.end());
+
+    // sort inputs by it's shape from smallest to largest
+    std::stable_sort(add_ops.begin(), add_ops.end(),
+                     [](const InputAndShape& lhs, const InputAndShape& rhs) {
+                       return CompareSymbolicallyShapedTensorSizes(lhs.shape,
+                                                                   rhs.shape);
+                     });
+
+    // If there is an odd number of inputs, last one is the largest, and we want
+    // to attach it to the root node, to build a well balanced tree.
+    std::deque<InputAndShape> add_ops_leftover;
+    if (add_ops.size() % 2 != 0) {
+      add_ops_leftover.push_back(add_ops.back());
+      add_ops.pop_back();
+    }
+
+    // At this point it's guaranteed that add_ops have even number of inputs.
+    do {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops.front();
+      add_ops.pop_front();
+
+      NodeDef* node;
+      if (!optimized_nodes.empty()) {
+        // re-purpose optimized nodes to build a new tree
+        node = optimized_nodes.front();
+        optimized_nodes.pop_front();
+      } else {
+        // or use root node if none optimized nodes left
+        node = group.root_node;
+      }
+      InputAndShape updated_node = UpdateInputs(lhs.input, rhs.input, node);
+
+      // Pushing updated node to the back of a deque will create a wide and
+      // short tree, pushing to the front will create a tall tree. We prefer to
+      // get a wide tree, it minimizes the potential number of temporary tensors
+      // required to keep in memory, though sometimes we can go up to prevent
+      // propagating a brodcast from leaves to the root. Example:
+      //
+      // inputs: [s, s, s, M] (s - scalar, M - matrix)
+      // @* - op with broadcast
+      //
+      //  (only push_back)           @*     (push_front first op)
+      //                            /  \
+      //       @*                  @    M
+      //     /   \                / \
+      //    @     @*      ->     @   s
+      //   / \   / \            / \
+      //  s   s s   M          s   s
+      if (add_ops.size() >= 2 &&
+          CompareSymbolicallyShapedTensorSizes(add_ops.at(0).shape,
+                                               add_ops.at(1).shape)) {
+        add_ops.push_front(updated_node);
+      } else {
+        add_ops.push_back(updated_node);
+      }
+    } while (add_ops.size() > 1);
+    CHECK_EQ(1, add_ops.size());
+
+    // attach the largest tensor to the root op
+    if (!add_ops_leftover.empty()) {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops_leftover.front();
+      InputAndShape updated_node =
+          UpdateInputs(lhs.input, rhs.input, group.root_node);
+      add_ops.push_back(updated_node);
+    }
+
+    return add_ops.front().input;
+  }
+
+  InputAndShape UpdateInputs(const string& input_0, const string& input_1,
+                             NodeDef* node) {
+    string old_input_0 = node->input(0);
+    string old_input_1 = node->input(1);
+
+    // Update inputs only if they changed
+    if (old_input_0 != input_0 || old_input_1 != input_1) {
+      node->set_input(0, input_0);
+      node->set_input(1, input_1);
+      // Invalidate node properties (shape)
+      ctx_.graph_properties->ClearOutputProperties(node->name());
+      ctx_.graph_properties->ClearInputProperties(node->name());
+      // Update the node map
+      ctx_.node_map->RemoveOutput(NodeName(old_input_0), node->name());
+      ctx_.node_map->RemoveOutput(NodeName(old_input_1), node->name());
+      ctx_.node_map->AddOutput(NodeName(input_0), node->name());
+      ctx_.node_map->AddOutput(NodeName(input_1), node->name());
+      // Add updated node to optimization queue
+      AddToOptimizationQueue(node);
+    }
+
+    // Do not add updated node to any other group
+    AddToOptimizedNodes(node);
+
+    TensorShapeProto shape;  // shape is not important at this point
+    return InputAndShape(node->name(), shape);
+  }
+};
+
 // Removes inverse transpose nodes
 class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
  public:
@@ -854,7 +1124,7 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
   // TODO(rmlarsen): Forward control dependencies on the bypassed
   // transpose nodes.
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     NodeDef* input;
     TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
@@ -943,7 +1213,7 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     // Bypass Bitcast whose source type and destination type are equal.
     if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
@@ -981,7 +1251,8 @@ class RemoveRedundantCastStage : public ArithmeticOptimizerStage {
   bool IsSupported(const NodeDef* node) const override { return IsCast(*node); }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
     // Bypass Cast whose source type and destination type are equal.
     if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
       *simplified_node_name = node->input(0);
@@ -1511,13 +1782,22 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
 
   if (node->op() == "Mul" && node->input(0) == node->input(1) &&
       !OptimizedNodeExists(*node, "square")) {
-    NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
-    new_square_node->set_op("Square");
-    for (int i = 1; i < new_square_node->input_size(); ++i) {
-      new_square_node->set_input(i - 1, new_square_node->input(i));
+    const DataType type = GetDataTypeFromAttr(*node, "T");
+    bool is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
+    string dontcare;
+    string device;
+    bool is_on_cpu =
+        DeviceNameUtils::SplitDeviceName(node->device(), &dontcare, &device) &&
+        str_util::StrContains(device, DEVICE_CPU);
+    if (!is_complex || is_on_cpu) {
+      NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
+      new_square_node->set_op("Square");
+      for (int i = 1; i < new_square_node->input_size(); ++i) {
+        new_square_node->set_input(i - 1, new_square_node->input(i));
+      }
+      new_square_node->mutable_input()->RemoveLast();
+      return new_square_node->name();
     }
-    new_square_node->mutable_input()->RemoveLast();
-    return new_square_node->name();
   }
 
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
@@ -1678,6 +1958,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
   if (options_.hoist_common_factor_out_of_aggregation && can_use_shapes)
     pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
+  if (options_.minimize_broadcasts && can_use_shapes)
+    pipeline.AddStage<MinimizeBroadcasts>(ctx, ctx_ext);
   if (options_.remove_identity_transpose && can_use_shapes)
     pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
   if (options_.remove_redundant_bitcast)
@@ -1749,12 +2031,11 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
 Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
                                      const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  GrapplerItem optimized_item(item);
-  optimized_graph_ = &optimized_item.graph;
-
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
+  *optimized_graph = item.graph;
+  optimized_graph_ = optimized_graph;
   node_map_.reset(new NodeMap(optimized_graph_));
 
   DedupComputations();
@@ -1763,8 +2044,9 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   // optimize larger subgraphs starting from the roots with more inputs.
   TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
 
-  // Shapes are only needed in aggressive mode.
-  graph_properties_.reset(new GraphProperties(item));
+  GrapplerItem optimized_item(item, optimized_graph);
+  optimized_graph_ = &optimized_item.graph;
+  graph_properties_.reset(new GraphProperties(optimized_item));
   const Status status = graph_properties_->InferStatically(false);
   const bool can_use_shapes = status.ok();
   if (!can_use_shapes) {
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 39b89dedba3f03e69cb19351e700cea176813afc..c0fe8839ca7bd111eaf95cf742a8f1de841dac14 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -59,6 +59,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool enable_try_simplify_and_replace = true;
     bool combine_add_to_addn = false;
     bool hoist_common_factor_out_of_aggregation = true;
+    bool minimize_broadcasts = false;
     bool remove_identity_transpose = true;
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
@@ -69,10 +70,10 @@ class ArithmeticOptimizer : public GraphOptimizer {
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
       ArithmeticOptimizerOptions options;
-      // TODO(ezhulenev): enable combine_add_to_addn by default after 1.8
-      // release cut
+      // TODO(ezhulenev): enable by default after 1.8 release cut
       if (opt_level == RewriterConfig::AGGRESSIVE) {
         options.combine_add_to_addn = true;
+        options.minimize_broadcasts = true;
       }
       return options;
     }
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index e117341ba362ade4c23262477dbe2d95a4d78f6f..e63981285810a24fb6d97e09533290fb64dce05d 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -31,6 +31,9 @@ namespace grappler {
 
 namespace {
 
+constexpr char kHoistFactorOptimizerDiv[] =
+    "ArithmeticOptimizer/HoistCommonFactor_Div_";
+
 constexpr char kHoistFactorOptimizerMul[] =
     "ArithmeticOptimizer/HoistCommonFactor_Mul_";
 
@@ -42,6 +45,11 @@ string HoistMulName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerMul, "");
 }
 
+// Optimized name of outer Div node by HoistCommonFactorOutOfAggregation
+string HoistDivName(const string& name) {
+  return AddPrefixToNodeName(name, kHoistFactorOptimizerDiv, "");
+}
+
 // Optimized name of inner Add node by HoistCommonFactorOutOfAggregation
 string HoistAddName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerAdd, "");
@@ -93,6 +101,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.enable_try_simplify_and_replace = false;
     options.combine_add_to_addn = false;
     options.hoist_common_factor_out_of_aggregation = false;
+    options.minimize_broadcasts = false;
     options.remove_identity_transpose = false;
     options.remove_redundant_bitcast = false;
     options.remove_redundant_cast = false;
@@ -113,6 +122,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.hoist_common_factor_out_of_aggregation = true;
   }
 
+  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.minimize_broadcasts = true;
+  }
+
   void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.remove_identity_transpose = true;
@@ -552,7 +566,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   EXPECT_EQ("^Placeholder", add_1_const_node->input(0));
 }
 
-TEST_F(ArithmeticOptimizerTest, HoistFactor) {
+TEST_F(ArithmeticOptimizerTest, HoistFactorMul) {
   for (bool matching_shapes : {true, false}) {
     for (bool use_addn : {true, false}) {
       tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -619,6 +633,81 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
   }
 }
 
+TEST_F(ArithmeticOptimizerTest, HoistFactorDiv) {
+  for (bool matching_shapes : {true, false}) {
+    for (bool use_addn : {true, false}) {
+      for (bool use_ints : {true, false}) {
+        tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+        Output x = use_ints
+                       ? ops::Const(s.WithOpName("x"), {1, 2}, {1, 2})
+                       : ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+        Output y1 = use_ints
+                        ? ops::Const(s.WithOpName("y1"), {3, 4}, {1, 2})
+                        : ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
+        Output y2;
+        if (matching_shapes) {
+          y2 = use_ints ? ops::Const(s.WithOpName("y2"), {5, 6}, {1, 2})
+                        : ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2});
+        } else {
+          y2 = use_ints ? ops::Const(s.WithOpName("y2"), {5}, {1, 1})
+                        : ops::Const(s.WithOpName("y2"), {5.0f}, {1, 1});
+        }
+        Output div1 = ops::Div(s.WithOpName("div1"), y1, x);
+        Output div2 = ops::Div(s.WithOpName("div2"), y2, x);
+        Output id =
+            use_addn
+                ? ops::Identity(s.WithOpName("id"),
+                                ops::AddN(s.WithOpName("add"), {div1, div2}))
+                : ops::Identity(s.WithOpName("id"),
+                                ops::Add(s.WithOpName("add"), div1, div2));
+
+        GrapplerItem item;
+        item.fetch = {"id"};
+        TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+        ArithmeticOptimizer optimizer;
+        EnableOnlyHoistCommonFactor(&optimizer);
+
+        GraphDef output;
+        OptimizeTwice(&optimizer, &item, &output);
+
+        // We expect the following rewrite(s) to occur:
+        //
+        //        Add                 Div
+        //      /    \               /   \
+        //    Div    Div       ->  Add    x
+        //    / \    / \           / \
+        //   y1  x  y2  x         y1  y2
+        //
+        // If "root" op is AddN and shapes does not match, this rewrite is not
+        // possible and graph should stay intact.
+        NodeMap node_map(&output);
+
+        if ((use_addn && !matching_shapes) || use_ints) {
+          VerifyGraphsMatch(item.graph, output, __LINE__);
+        } else {
+          EXPECT_EQ(9, output.node_size());
+
+          const NodeDef* new_add_node = node_map.GetNode(HoistAddName("add"));
+          ASSERT_TRUE(new_add_node != nullptr) << "Hoisted Add node not found";
+          EXPECT_EQ("y1", new_add_node->input(0));
+          EXPECT_EQ("y2", new_add_node->input(1));
+
+          const NodeDef* new_div_node = node_map.GetNode(HoistDivName("add"));
+          ASSERT_TRUE(new_div_node != nullptr) << "Hoisted Div node not found";
+          EXPECT_EQ(new_add_node->name(), new_div_node->input(0));
+          EXPECT_EQ("x", new_div_node->input(1));
+
+          const NodeDef* id_node = node_map.GetNode("id");
+          ASSERT_TRUE(id_node != nullptr) << "Id node not found";
+          EXPECT_EQ("id", id_node->name());
+          EXPECT_EQ(HoistDivName("add"), id_node->input(0));
+        }
+      }
+    }
+  }
+}
+
 TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
@@ -1841,5 +1930,160 @@ TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
   EXPECT_EQ(5, found);
 }
 
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_SimpleSwap) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul2);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //     *                  *
+  //    / \                / \
+  //   *   c      -->     *   b
+  //  / \                / \
+  // a   b              a   c
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("c", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("mul1", mul2_node->input(0));
+  EXPECT_EQ("b", mul2_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_FlattenTallGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  auto d = ops::Variable(s.WithOpName("d"), {32}, DT_FLOAT);
+  auto e = ops::Variable(s.WithOpName("e"), {32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
+  auto mul3 = ops::Mul(s.WithOpName("mul3"), mul2, d);
+  auto mul4 = ops::Mul(s.WithOpName("mul4"), mul3, e);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul4);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur: Graph is "flattened" and
+  // largest shape pushed to the top.
+  //
+  //          *
+  //        /   \
+  //       *     e                *
+  //      /  \                  /   \
+  //     *    d               *      b
+  //    / \                 /  \
+  //   *   c      -->     *      *
+  //  / \                / \    / \
+  // a   b              a   c  d   e
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("c", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("d", mul2_node->input(0));
+  EXPECT_EQ("e", mul2_node->input(1));
+
+  const NodeDef* mul3_node = node_map.GetNode("mul3");
+  ASSERT_NE(mul3_node, nullptr);
+  EXPECT_EQ("mul1", mul3_node->input(0));
+  EXPECT_EQ("mul2", mul3_node->input(1));
+
+  const NodeDef* mul4_node = node_map.GetNode("mul4");
+  ASSERT_NE(mul4_node, nullptr);
+  EXPECT_EQ("mul3", mul4_node->input(0));
+  EXPECT_EQ("b", mul4_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // [a, b, c] - scalars, [d] - matrix
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  auto d = ops::Variable(s.WithOpName("D"), {32, 32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), c, d);
+  auto mul3 = ops::Mul(s.WithOpName("mul3"), mul1, mul2);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul3);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //                              *
+  //                            /  \
+  //       *                   *    D
+  //     /   \                / \
+  //    *     *      ->      *   c
+  //   / \   / \            / \
+  //  a   b c   D          a   b
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("b", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("mul1", mul2_node->input(0));
+  EXPECT_EQ("c", mul2_node->input(1));
+
+  const NodeDef* mul3_node = node_map.GetNode("mul3");
+  ASSERT_NE(mul3_node, nullptr);
+  EXPECT_EQ("D", mul3_node->input(0));
+  EXPECT_EQ("mul2", mul3_node->input(1));
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index b2a1ce6ab6746d02f729f380ff9ca3fbdea8e781..e29aaa25fe3797e994d13239ceff8478ca779997 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1004,7 +1004,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
 
   for (const auto& input : node.input()) {
     int port = 0;
-    ParseNodeName(input, &port);
+    ParseNodeNameAsStringPiece(input, &port);
     if (port < 0) {
       // Control dependency
       break;
@@ -2084,9 +2084,9 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
           left_child_is_constant ? left_child : right_child;
       // Make sure that it is safe to change the value of the child node->
       if (op_child_node->input_size() < 2 ||
-          NumNonControlOutputs(*op_child_node, *node_map_) > 1 ||
           nodes_to_preserve_.find(op_child_node->name()) !=
-              nodes_to_preserve_.end()) {
+              nodes_to_preserve_.end() ||
+          NumNonControlOutputs(*op_child_node, *node_map_) > 1) {
         continue;
       }
 
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 08c92687e34f45ace4be14d7c3e7deb2eb8551e3..36625b68b776f89c10a065ff2c031b287e94ab76 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -933,6 +933,17 @@ TEST_F(ConstantFoldingTest, ShapeMaterialization) {
     }
   }
   EXPECT_EQ(1, found);
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>({5, 7});
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>({11, 13});
+
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, item.fetch.size());
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(1, item.fetch.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
@@ -1095,6 +1106,17 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
     }
   }
   EXPECT_EQ(4, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"v1", v1_t}, {"v2", v2_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(output, item.fetch, {{"v1", v1_t}, {"v2", v2_t}});
+  EXPECT_EQ(2, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }
 
 TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
@@ -1234,6 +1256,18 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
     }
   }
   EXPECT_EQ(2, found);
+
+  auto v_in_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
+  Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
+  v_ctrl_t.flat<bool>()(0) = true;
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
+  EXPECT_EQ(2, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, MergeNodes) {
@@ -1374,6 +1408,16 @@ TEST_F(ConstantFoldingTest, SplitRemoval) {
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, SplitVRemoval) {
@@ -1416,6 +1460,16 @@ TEST_F(ConstantFoldingTest, SplitVRemoval) {
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
@@ -1450,6 +1504,17 @@ TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
   AddNode("out2", "Identity", {"s2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(2, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(2, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
@@ -1486,6 +1551,16 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
     CompareGraphs(want, got);
+
+    auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5}));
+    auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+    auto tensors_expected =
+        EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors_expected.size());
+    auto tensors =
+        EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
   }
   {  // size = {-1, -1}
     tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
@@ -1524,6 +1599,16 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
     CompareGraphs(want, got);
+
+    auto in1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5}));
+    auto in2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+    auto tensors_expected =
+        EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors_expected.size());
+    auto tensors =
+        EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
   }
 }
 
@@ -1602,6 +1687,16 @@ TEST_F(ConstantFoldingTest, PaddingWithZeroSize) {
   AddNode("out", "Add", {"p1", "p2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_INT32>(TensorShape({4, 6}));
+  auto in2_t = GenerateRandomTensor<DT_INT32>(TensorShape({2, 2}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
@@ -1632,6 +1727,16 @@ TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
+
+  auto in1_t = GenerateRandomTensor<DT_INT32>(TensorShape({2, 3}));
+  auto in2_t = GenerateRandomTensor<DT_INT32>(TensorShape({1, 2, 3, 1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(got, item.fetch, {{"in1", in1_t}, {"in2", in2_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, NoOpReduction) {
@@ -1666,6 +1771,13 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
     }
   }
   EXPECT_TRUE(found);
+
+  auto v_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 5, 7}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"v", v_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch, {{"v", v_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, NoOpReshape) {
@@ -1744,6 +1856,21 @@ TEST_F(ConstantFoldingTest, NoOpReshape) {
     }
   }
   EXPECT_EQ(4, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({17}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({17, 1}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 5, 5}));
+  auto v4_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 5, 5}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch,
+                    {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}, {"v4", v4_t}});
+  EXPECT_EQ(4, tensors_expected.size());
+  auto tensors =
+      EvaluateNodes(output, item.fetch,
+                    {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}, {"v4", v4_t}});
+  EXPECT_EQ(4, tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, Packing) {
@@ -1925,6 +2052,14 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch.push_back("reshape");
 
+  auto input_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  Tensor indices_t(DT_INT32, TensorShape({2}));
+  indices_t.flat<int>()(0) = 0;
+  indices_t.flat<int>()(1) = 1;
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"input", input_t}, {"indices", indices_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -1951,6 +2086,11 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
     }
   }
   EXPECT_EQ(3, found);
+
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"input", input_t}, {"indices", indices_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, LargeConstant) {
@@ -2047,6 +2187,23 @@ TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
     }
   }
   EXPECT_EQ(6, found);
+
+  // Evaluate id_true when input tensor x is true.
+  Tensor x_t(DT_BOOL, TensorShape({}));
+  x_t.flat<bool>()(0) = true;
+  auto tensors_expected = EvaluateNodes(item.graph, {"id_true"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, {"id_true"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bool>(tensors_expected[0], tensors[0]);
+
+  // Evalute id_false when input tensor is false.
+  x_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, {"id_false"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  tensors = EvaluateNodes(output, {"id_false"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bool>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
@@ -2288,6 +2445,15 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
       EXPECT_EQ("^id_n", node.input(0));
     }
   }
+
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(4, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(4, tensors.size());
+  for (int i = 0; i < tensors.size(); i++) {
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+  }
 }
 
 TEST_F(ConstantFoldingTest, TrivialPack) {
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.cc b/tensorflow/core/grappler/optimizers/debug_stripper.cc
index 8bd10171f15f80d3bb85d63c8f62067992c4f37e..9701a038d0287db7745b9181b429bd81b1cdd854 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -40,10 +41,22 @@ Status DebugStripper::Optimize(Cluster* cluster, const GrapplerItem& item,
           inp = AsControlDependency(inp);
         }
       }
-    } else if (IsCheckNumerics(node)) {
+    } else if (IsCheckNumerics(node) || IsPrint(node)) {
       // Replace with Identity op which will be pruned later.
       node.set_op("Identity");
-      node.mutable_attr()->erase("message");
+      // Only preserve T attribute.
+      protobuf::Map<string, AttrValue> new_attr;
+      if (node.attr().find("T") != node.attr().end()) {
+        new_attr.insert({"T", node.attr().at("T")});
+      }
+      node.mutable_attr()->swap(new_attr);
+      // As Identity op only takes one input, mark redundant inputs as control
+      // input.
+      for (size_t i = 1; i < node.input_size(); ++i) {
+        if (!IsControlInput(node.input(i))) {
+          *node.mutable_input(i) = AsControlDependency(node.input(i));
+        }
+      }
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
index 3f11febc64dbd55aaaebcdf8d1763517a966264b..96ceee791f8f15c3cba1d8a6a5ae5e1f1106597c 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
@@ -164,6 +164,42 @@ TEST_F(DebugStripperTest, StripCheckNumericsFromGraph) {
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
+TEST_F(DebugStripperTest, StripPrintFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output print = ops::Print(s.WithOpName("Print"), x, {x});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "Print") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ(1, node.attr_size());
+    }
+  }
+
+  EXPECT_EQ(2, output.node_size());
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"Print"}, {{"x", x_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"Print"}, {{"x", x_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index ed9bce439c6d6a5c09e3af53718fa49e191549ab..7b7fd8115588a6dceb4a74d502e2883a84f57199 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -109,23 +109,12 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
 }
 
 bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
-  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
-    return false;
-  }
-  if (!fetch_nodes_known_ || NumNonControlOutputs(node, *node_map_) > 0) {
-    // The output values of this node may be needed.
-    return false;
-  }
-  if (IsMerge(node) || IsSwitch(node)) {
-    return false;
-  }
-  if (ModifiesFrameInfo(node)) {
-    return false;
-  }
-  if (!IsFreeOfSideEffect(node)) {
+  if (!fetch_nodes_known_ ||
+      nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
-  if (node.op() == "ControlTrigger") {
+  if (IsMerge(node) || IsSwitch(node) || ModifiesFrameInfo(node) ||
+      !IsFreeOfSideEffect(node)) {
     return false;
   }
   if (node.op().rfind("Submodel", 0) == 0) {
@@ -136,16 +125,21 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
   if (!status.ok() || op_def->output_arg_size() == 0) {
     return false;
   }
-
+  const std::unordered_set<string> do_not_rewrite_ops{
+      "Assert",      "CheckNumerics",         "_Retval",
+      "_Arg",        "_ParallelConcatUpdate", "_TPUExecute",
+      "_TPUCompile", "ControlTrigger"};
+  if (do_not_rewrite_ops.find(node.op()) != do_not_rewrite_ops.end()) {
+    return false;
+  }
   if (!SafeToRemoveIdentity(node)) {
     return false;
   }
-
-  const std::unordered_set<string> do_not_rewrite_ops{
-      "Assert",     "CheckNumerics",         "_Retval",
-      "_Arg",       "_ParallelConcatUpdate", "_TPUExecute",
-      "_TPUCompile"};
-  return do_not_rewrite_ops.find(node.op()) == do_not_rewrite_ops.end();
+  if (NumNonControlOutputs(node, *node_map_) > 0) {
+    // The output values of this node may be needed.
+    return false;
+  }
+  return true;
 }
 
 void DependencyOptimizer::OptimizeNode(int node_idx,
@@ -164,7 +158,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       bool data_connection = false;
       for (int i = fanout->input_size() - 1; i >= 0; --i) {
         int pos;
-        string input_name = ParseNodeName(fanout->input(i), &pos);
+        StringPiece input_name =
+            ParseNodeNameAsStringPiece(fanout->input(i), &pos);
         if (input_name == node_name) {
           if (pos < 0) {
             fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
@@ -358,8 +353,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
           for (int j = 0; j < consumer->input_size(); ++j) {
             const string& old_input = consumer->input(j);
             int old_input_pos;
-            string old_input_node_name =
-                ParseNodeName(old_input, &old_input_pos);
+            StringPiece old_input_node_name =
+                ParseNodeNameAsStringPiece(old_input, &old_input_pos);
             if (old_input_node_name == node_name) {
               if (old_input_pos >= 0) {
                 // Regular input
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 343c89a9da8fc32a985dc36b23280f097ba3f48e..6d67ead3550013ffd468c7b8113d43e5c214f0d2 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -38,11 +38,14 @@ class FunctionInliningContext {
  public:
   explicit FunctionInliningContext(const GrapplerItem& item,
                                    RewriterConfig::Toggle opt_level)
-      : library_(&item.graph.library()),
-        opt_level_(opt_level),
-        functions_(InliningCandidates(item)) {}
+      : opt_level_(opt_level),
+        functions_(InliningCandidates(item)),
+        function_library_(FunctionLibraryDefinition(OpRegistry::Global(),
+                                                    item.graph.library())) {}
 
-  const FunctionDefLibrary& Library() const { return *library_; }
+  const FunctionLibraryDefinition& FunctionLibrary() const {
+    return function_library_;
+  }
 
   bool HasInlinedFunctions() const { return !functions_.empty(); }
 
@@ -78,9 +81,9 @@ class FunctionInliningContext {
     return functions;
   }
 
-  const FunctionDefLibrary* library_;
   RewriterConfig::Toggle opt_level_;
   std::unordered_map<string, const FunctionDef*> functions_;
+  FunctionLibraryDefinition function_library_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
 };
@@ -150,11 +153,14 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, ctx.Library());
-  if (!item) {
+  GrapplerFunctionItem item;
+  Status item_status =
+      MakeGrapplerFunctionItem(func, func_attr, ctx.FunctionLibrary(), &item);
+
+  if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
-                                   " instantiated by ", func_node.name());
+                                   " instantiated by ", func_node.name(),
+                                   ". Error: ", item_status.error_message());
   }
 
   std::unordered_map<string, int> input_nodes;
@@ -168,7 +174,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
   TF_RETURN_IF_ERROR(
       HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs));
 
-  for (NodeDef& func_body_node : *item->graph.mutable_node()) {
+  for (NodeDef& func_body_node : *item.mutable_function_body().mutable_node()) {
     if (input_nodes.find(func_body_node.name()) != input_nodes.end()) {
       CHECK_EQ(0, func_body_node.input_size());
       // Turn input placeholders into identity nodes
@@ -217,8 +223,9 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
 
   // Hook inlined function outputs to IdentityN node
   NodeDef* func_outputs = optimized_graph->add_node();
+  std::vector<string> fetch = OutputTensors(item);
   TF_RETURN_IF_ERROR(HookInlinedFunctionOutputs(func_node, func, func_attr,
-                                                item->fetch, func_outputs));
+                                                fetch, func_outputs));
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index fe26a56fc214d65a617310b158e8ead55b37469f..099fe7caf25afdf17fd9c7705d617bbc5d17b7b6 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -92,13 +92,13 @@ TEST_F(FunctionOptimizerTest, SimpleFunction) {
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("y/x", node.input(0));
-      EXPECT_EQ("y/scale:0", node.input(1));
+      EXPECT_EQ("y/scale", node.input(1));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/y:0", node.input(0));
+      EXPECT_EQ("y/y", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
@@ -180,13 +180,13 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("y/x", node.input(0));
-      EXPECT_EQ("y/two:0", node.input(1));
+      EXPECT_EQ("y/two", node.input(1));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/y:0", node.input(0));
+      EXPECT_EQ("y/y", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
@@ -264,13 +264,13 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
       EXPECT_EQ("Exp", node.op());
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/Linear_func:0", node.input(0));
+      EXPECT_EQ("y/Linear_func", node.input(0));
     } else if (node.name() == "y") {
       count++;
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(device, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("y/Exp:0", node.input(0));
+      EXPECT_EQ("y/Exp", node.input(0));
     } else if (node.name() == "z") {
       count++;
       EXPECT_EQ("Identity", node.op());
@@ -453,12 +453,12 @@ TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/output/output:0", node.input(0));
+      EXPECT_EQ("square/output/output", node.input(0));
     } else if (node.name() == "square" && count++) {
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square/output:0", node.input(0));
+      EXPECT_EQ("square/output", node.input(0));
     } else if (node.name() == "outputs" && count++) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(kDevice, node.device());
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index 7ed047486111992dd61cc116319da91f0f93ac64..072f7729466ddcee68dc2c9ca0b27ee4b97b18c7 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -134,6 +134,18 @@ class GraphOptimizerStage {
   // and remove template parameter.
   virtual Status TrySimplify(NodeDef* node, Result* result) = 0;
 
+  // Return InvalidArgumentError if node is not supported by the optimizer
+  // stage.
+  // TODO(ezhulenev): make this check part of non-virtual public API
+  // (TrySimplify), and make virtual implementation protected.
+  Status EnsureNodeIsSupported(const NodeDef* node) const {
+    return IsSupported(node)
+               ? Status::OK()
+               : errors::InvalidArgument(
+                     "Node ", node->name(), " is not supported by optimizer ",
+                     optimizer_name_, " and stage ", stage_name_);
+  }
+
   // Get a name for a new node, created by this stage, based on one or multiple
   // nodes of an original graph.
   const string OptimizedNodeName(const NodeScopeAndName& node) const {
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 308eecd4205d0d6efd6aecc9f8ca18e958342faa..8fb30d116de8f2784a7f58baea6c77d4c76e0ae9 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -17,9 +17,13 @@ limitations under the License.
 #include <unordered_set>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -363,6 +367,28 @@ std::vector<int> DataInputPos(const NodeDef& node) {
   return {};
 }
 
+bool IsHostMemory(const NodeDef& node, int output_port) {
+  DeviceNameUtils::ParsedName parsed_name;
+  if (DeviceNameUtils::ParseFullName(node.device(), &parsed_name)) {
+    DeviceType device_type(parsed_name.type);
+    Status s = FindKernelDef(device_type, node, nullptr, nullptr);
+    if (s.ok()) {
+      tensorflow::MemoryTypeVector in_mtypes;
+      tensorflow::MemoryTypeVector out_mtypes;
+      s = tensorflow::MemoryTypesForNode(OpRegistry::Global(), device_type,
+                                         node, &in_mtypes, &out_mtypes);
+      if (s.ok()) {
+        if (out_mtypes[output_port] == HOST_MEMORY) {
+          return true;
+        }
+      }
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
 class GraphProcessor {
  public:
   GraphProcessor(const GraphProperties& graph_properties,
@@ -883,6 +909,24 @@ class NodeProcessor : public GraphProcessor {
     list->set_i(3, w);
   }
 
+  string MaybeGetHostDevice(const string& input_name) const {
+    string device = node_->device();
+    DeviceNameUtils::ParsedName parsed_name;
+    if (DeviceNameUtils::ParseFullName(device, &parsed_name)) {
+      if (parsed_name.type != "CPU") {
+        NodeDef* input = node_map_->GetNode(input_name);
+        int port;
+        ParseNodeName(input_name, &port);
+        if (IsHostMemory(*input, port)) {
+          parsed_name.type = "CPU";
+          parsed_name.id = 0;
+          device = DeviceNameUtils::ParsedNameToString(parsed_name);
+        }
+      }
+    }
+    return device;
+  }
+
   NodeDef* AddNodeDataFormatOp(const string& name, const string& input_name,
                                const string& op, DataType dtype,
                                bool nhwc_to_nchw) {
@@ -890,7 +934,9 @@ class NodeProcessor : public GraphProcessor {
     added_node->set_name(name);
     added_node->set_op(op);
     node_map_->AddNode(added_node->name(), added_node);
-    added_node->set_device(node_->device());
+    // The inputs of a DataFormat op could be in host memory for ops such as
+    // Reshape.
+    added_node->set_device(MaybeGetHostDevice(input_name));
     AttrValue attr_data_type;
     attr_data_type.set_type(dtype);
     added_node->mutable_attr()->insert({"T", attr_data_type});
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 1c912fcaa251c576308a983ef351319053423a85..e405c4c58c93fdf8d898fa660d6fe3cee9f30e35 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -17,10 +17,15 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
@@ -29,15 +34,25 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class LayoutOptimizerTest : public ::testing::Test {
+class LayoutOptimizerTest : public GrapplerTest {
  protected:
   void SetUp() override {
-    DeviceProperties device_properties;
-    device_properties.set_type("GPU");
-    device_properties.mutable_environment()->insert({"architecture", "6"});
-    virtual_cluster_.reset(new VirtualCluster({{"/GPU:0", device_properties}}));
+    gpu_available_ = GetNumAvailableGPUs() > 0;
+
+    if (gpu_available_) {
+      virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
+    } else {
+      DeviceProperties device_properties;
+      device_properties.set_type("GPU");
+      device_properties.mutable_environment()->insert({"architecture", "6"});
+      virtual_cluster_.reset(
+          new VirtualCluster({{"/GPU:1", device_properties}}));
+    }
+    TF_CHECK_OK(virtual_cluster_->Provision());
   }
 
+  void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); }
+
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
                       const string& padding) {
     return SimpleConv2D(s, input_size, filter_size, padding, "");
@@ -158,7 +173,8 @@ class LayoutOptimizerTest : public ::testing::Test {
     return output.x_backprop;
   }
 
-  std::unique_ptr<VirtualCluster> virtual_cluster_;
+  std::unique_ptr<Cluster> virtual_cluster_;
+  bool gpu_available_;
 };
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
@@ -182,6 +198,15 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   Tensor input_sizes_expected(DT_INT32, {4});
   test::FillValues<int>(&input_sizes_expected, {128, 3, 7, 7});
   test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes);
+
+  if (gpu_available_) {
+    std::vector<string> fetch = {"Fetch"};
+    auto tensors_expected = EvaluateNodes(item.graph, fetch);
+    auto tensors = EvaluateNodes(output, fetch);
+    EXPECT_EQ(1, tensors_expected.size());
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  }
 }
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInputNonConstInputSizes) {
@@ -1130,6 +1155,27 @@ TEST_F(LayoutOptimizerTest, LoopNoLiveLock) {
   EXPECT_EQ(mul_node->input(0),
             "Conv2D-0-0-TransposeNCHWToNHWC-LayoutOptimizer");
 }
+
+TEST_F(LayoutOptimizerTest, DevicePlacement) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto shape = ops::Shape(s.WithOpName("s"), conv);
+  auto i = ops::Identity(s.WithOpName("i"), shape);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  VirtualPlacer virtual_placer(virtual_cluster_.get());
+  for (auto& node : *item.graph.mutable_node()) {
+    string device = virtual_placer.get_canonical_device_name(node);
+    node.set_device(device);
+  }
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto vec_permute =
+      node_map.GetNode("s-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  EXPECT_TRUE(str_util::EndsWith(vec_permute->device(), "CPU:0"));
+}
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index a063dc33816e25c560a385e188203c9ad9bfe4cd..fff06dd2acefe424b75b21904fa21494af020be8 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -16,18 +16,17 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 
 #include <algorithm>
+#include <deque>
 #include <limits>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <deque>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -46,74 +45,36 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-std::vector<int> GetStackPushNodesToConvert(
-    const SimpleGraphView& graph_view,
-    const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
-  VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
-  const std::unordered_set<string> op_types_to_traverse(
-      {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
-       "Identity", "RefIdentity"});
-  std::vector<int> nodes_to_convert;
-  std::set<int> fanout;
-  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
-  for (int fanout_idx : fanout) {
-    const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
-    VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
-    if (IsStackPushOp(fanout_node)) {
-      nodes_to_convert.push_back(fanout_idx);
-    } else if (IsStackOp(fanout_node) || IsStackCloseOp(fanout_node) ||
-               op_types_to_traverse.find(fanout_node.op()) !=
-                   op_types_to_traverse.end()) {
-      continue;
-    } else if (!IsStackPopOp(fanout_node) ||
-               (!graph_view.outputs(fanout_idx).empty() ||
-                nodes_to_preserve.find(fanout_node.name()) !=
-                    nodes_to_preserve.end())) {
-      // The node is either a stack pop with consumers or something unexpected
-      // so we leave the graph alone.
-      nodes_to_convert.clear();
-      break;
-    }
-  }
-  return nodes_to_convert;
-}
+class LoopInvariantNodeMotionOptimizer {
+ public:
+  explicit LoopInvariantNodeMotionOptimizer(GraphDef* optimized_graph)
+      : optimized_graph_(optimized_graph) {}
+  virtual ~LoopInvariantNodeMotionOptimizer() = default;
+  Status Optimize();
 
-Status RemoveStackOps(const GrapplerItem& item, GraphDef* optimized_graph) {
-  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
-  const GraphDef& graph = item.graph;
-  *optimized_graph = graph;
-  NodeMap node_map(optimized_graph);
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
-  for (int node_idx = 0; node_idx < graph.node_size(); ++node_idx) {
-    if (IsStackOp(graph.node(node_idx))) {
-      for (int push_node_idx : GetStackPushNodesToConvert(
-               graph_view, nodes_to_preserve, node_idx)) {
-        // We found push nodes without corresponding pops. Convert them to
-        // Identity passing the data through and add a control dependency from
-        // the op supplying the stack handle.
-        NodeDef* push_node = optimized_graph->mutable_node(push_node_idx);
-        VLOG(1) << "Converting " << push_node_idx << " : "
-                << push_node->DebugString();
-        if (push_node->attr().count("swap_memory") != 0) {
-          push_node->mutable_attr()->erase("swap_memory");
-        }
-        push_node->set_op("Identity");
-        push_node->mutable_input()->SwapElements(0, 1);
-        const string ctrl_dep = ConstantFolding::AddControlDependency(
-            push_node->input(1), optimized_graph, &node_map);
-        push_node->set_input(1, ctrl_dep);
-        VLOG(1) << "After converting: " << push_node->DebugString();
-      }
-    }
-  }
-  return Status::OK();
-}
+ private:
+  Status FindInvariantNodes(NodeDef* node);
+  Status RevertInvariantNodes();
+  Status MoveInvariantNodes(const int frame_id);
+  Status HandleInvariantNode(NodeDef* node, const int num_outputs,
+                             const int frame_id);
+  Status HandleConst(NodeDef* node, const int num_outputs, const int frame_id);
+  Status HandleInvariantEnter(NodeDef* node, const int num_outputs);
 
-}  // namespace
+  GraphDef* optimized_graph_;  // Not owned.
+  std::unique_ptr<NodeMap> node_map_;
+  std::map<NodeDef*, int> invariant_nodes_;
+  std::set<int> empty_set_;
+  // TODO(rmlarsen): Use vector instead of map, since frames ids are dense.
+  std::map<int, std::set<int>> frame_children_;
+  std::map<int, int> frame_parent_;
+  std::map<int, const NodeDef*> loop_cond_;
+  std::map<int, std::vector<NodeDef*>> invariant_enters_;
+  int new_enter_id_;
+};
 
-Status LoopOptimizer::LINMHandleInvariantEnter(NodeDef* node,
-                                               const int num_outputs) {
+Status LoopInvariantNodeMotionOptimizer::HandleInvariantEnter(
+    NodeDef* node, const int num_outputs) {
   auto consumers = node_map_->GetOutputs(node->name());
   std::vector<string> enter_control_inputs;
   string enter_input;
@@ -142,9 +103,10 @@ Status LoopOptimizer::LINMHandleInvariantEnter(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::LINMHandleConst(NodeDef* node,
-    const int num_outputs, const int frame_id) {
-  NodeDef* const_node;
+Status LoopInvariantNodeMotionOptimizer::HandleConst(NodeDef* node,
+                                                     const int num_outputs,
+                                                     const int frame_id) {
+  NodeDef* const_node = nullptr;
   if (num_outputs == 0) {
     // all successor nodes are invariant
     // Remove the control inputs from this frame to the const node,
@@ -156,12 +118,17 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
     // some successor nodes are variant
     // Have to keep the const node in the frame,
     // so create a new one outside the frame (in parent frame)
-    const_node = optimized_graph_->add_node();
-    const_node->set_name(AddPrefixToNodeName(node->name(), kLoopOptimizer));
-    const_node->set_op("Const");
-    const_node->set_device(node->device());
-    *const_node->mutable_attr() = node->attr();
-    node_map_->AddNode(const_node->name(), const_node);
+    const string const_node_name =
+        AddPrefixToNodeName(node->name(), kLoopOptimizer);
+    const_node = node_map_->GetNode(const_node_name);
+    if (const_node == nullptr) {
+      const_node = optimized_graph_->add_node();
+      const_node->set_name(const_node_name);
+      const_node->set_op("Const");
+      const_node->set_device(node->device());
+      *const_node->mutable_attr() = node->attr();
+      node_map_->AddNode(const_node->name(), const_node);
+    }
     auto consumers = node_map_->GetOutputs(node->name());
     for (auto* consumer : consumers) {
       if (invariant_nodes_.count(consumer)) {
@@ -185,8 +152,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
     int parent_id = parent_it->second;
     auto loop_cond_it = loop_cond_.find(parent_id);
     if (loop_cond_it == loop_cond_.end()) {
-      return errors::InvalidArgument(
-          "Frame ", frame_id, " doesn't have a LoopCond node");
+      return errors::InvalidArgument("Frame ", frame_id,
+                                     " doesn't have a LoopCond node");
     }
     auto& loop_cond_name = loop_cond_it->second->name();
     NodeDef* switch_node = nullptr;
@@ -197,9 +164,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
       }
     }
     if (!switch_node) {
-      return errors::InvalidArgument(
-          "LoopCond node of Frame ", frame_id,
-          " doesn't connect to any Switch node");
+      return errors::InvalidArgument("LoopCond node of Frame ", frame_id,
+                                     " doesn't connect to any Switch node");
     }
     string switch_output = StrCat(switch_node->name(), ":1");
     const string ctrl_dep = ConstantFolding::AddControlDependency(
@@ -210,8 +176,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
-    const int num_outputs, const int frame_id) {
+Status LoopInvariantNodeMotionOptimizer::HandleInvariantNode(
+    NodeDef* node, const int num_outputs, const int frame_id) {
   // have to remove control inputs to the invariant node from the same frame
   // when moving this node out of this frame
   for (int i = 0; i < node->input_size(); ++i) {
@@ -228,16 +194,14 @@ Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
   DataTypeVector output_types;
   OpRegistryInterface* op_registry = OpRegistry::Global();
   const OpRegistrationData* op_reg_data = nullptr;
-  TF_RETURN_IF_ERROR(
-      op_registry->LookUp(node->op(), &op_reg_data));
-  TF_RETURN_IF_ERROR(
-      InOutTypesForNode(*node, op_reg_data->op_def,
-                        &input_types, &output_types));
+  TF_RETURN_IF_ERROR(op_registry->LookUp(node->op(), &op_reg_data));
+  TF_RETURN_IF_ERROR(InOutTypesForNode(*node, op_reg_data->op_def, &input_types,
+                                       &output_types));
 
   auto consumers = node_map_->GetOutputs(node->name());
   string fname = invariant_enters_[frame_id][0]->attr().at("frame_name").s();
-  int piterations = invariant_enters_[frame_id][0]
-                    ->attr().at("parallel_iterations").i();
+  int piterations =
+      invariant_enters_[frame_id][0]->attr().at("parallel_iterations").i();
   for (auto* consumer : consumers) {
     if (!invariant_nodes_.count(consumer)) {
       for (int i = 0; i < consumer->input_size(); ++i) {
@@ -281,28 +245,27 @@ Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::MoveInvariantNodes(const int frame_id) {
-  for (auto iter = invariant_nodes_.begin();
-       iter != invariant_nodes_.end(); ++iter) {
+Status LoopInvariantNodeMotionOptimizer::MoveInvariantNodes(
+    const int frame_id) {
+  for (auto iter = invariant_nodes_.begin(); iter != invariant_nodes_.end();
+       ++iter) {
     auto* invariant_node = iter->first;
     const int num_outputs = iter->second;
     if (IsEnter(*invariant_node)) {
-      TF_RETURN_IF_ERROR(
-          LINMHandleInvariantEnter(invariant_node, num_outputs));
+      TF_RETURN_IF_ERROR(HandleInvariantEnter(invariant_node, num_outputs));
     } else if (IsConstant(*invariant_node)) {
-      TF_RETURN_IF_ERROR(
-          LINMHandleConst(invariant_node, num_outputs, frame_id));
+      TF_RETURN_IF_ERROR(HandleConst(invariant_node, num_outputs, frame_id));
     } else {
       TF_RETURN_IF_ERROR(
-          LINMHandleInvariantNode(invariant_node, num_outputs, frame_id));
+          HandleInvariantNode(invariant_node, num_outputs, frame_id));
     }
   }
   return Status::OK();
 }
 
-Status LoopOptimizer::RevertInvariantNodes() {
+Status LoopInvariantNodeMotionOptimizer::RevertInvariantNodes() {
   std::deque<const NodeDef*> reverted_nodes;
-  for (auto iter=invariant_nodes_.begin(); iter != invariant_nodes_.end();) {
+  for (auto iter = invariant_nodes_.begin(); iter != invariant_nodes_.end();) {
     bool erased = false;
     const auto* node = iter->first;
     if (!IsConstant(*node) && !IsEnter(*node) && iter->second > 0) {
@@ -331,8 +294,8 @@ Status LoopOptimizer::RevertInvariantNodes() {
       auto* producer = node_map_->GetNode(input);
       auto iter = invariant_nodes_.find(producer);
       if (iter != invariant_nodes_.end()) {
-        if (IsControlInput(input) &&
-            !IsConstant(*producer) && !IsEnter(*producer)) {
+        if (IsControlInput(input) && !IsConstant(*producer) &&
+            !IsEnter(*producer)) {
           reverted_nodes.push_back(producer);
           invariant_nodes_.erase(iter);
         } else {
@@ -357,12 +320,11 @@ Status LoopOptimizer::RevertInvariantNodes() {
   return Status::OK();
 }
 
-Status LoopOptimizer::FindInvariantNodes(NodeDef* node) {
+Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(NodeDef* node) {
   auto consumers = node_map_->GetOutputs(node->name());
   invariant_nodes_.insert(std::make_pair(node, consumers.size()));
   for (auto* consumer : consumers) {
-    if (invariant_nodes_.count(consumer) ||
-        ModifiesFrameInfo(*consumer)) {
+    if (invariant_nodes_.count(consumer) || ModifiesFrameInfo(*consumer)) {
       continue;
     }
     bool is_invariant = true;
@@ -399,9 +361,14 @@ Status LoopOptimizer::FindInvariantNodes(NodeDef* node) {
   return Status::OK();
 }
 
-Status LoopOptimizer::LoopInvariantNodeMotion() {
+Status LoopInvariantNodeMotionOptimizer::Optimize() {
+  node_map_.reset(new NodeMap(optimized_graph_));
+  FrameMap frame_map;
+  int num_frames;
+  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+                                               &frame_map, &num_frames));
   std::deque<int> worklist;
-  for (auto iter = frame_map_.begin(); iter != frame_map_.end(); ++iter) {
+  for (auto iter = frame_map.begin(); iter != frame_map.end(); ++iter) {
     auto* node = iter->first;
     auto& frame_ids = iter->second;
     if (frame_ids.size() >= 3) {
@@ -467,19 +434,82 @@ Status LoopOptimizer::LoopInvariantNodeMotion() {
   return Status::OK();
 }
 
-Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
+std::vector<int> GetStackPushNodesToConvert(
+    const SimpleGraphView& graph_view,
+    const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
+  VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
+  const std::unordered_set<string> op_types_to_traverse(
+      {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
+       "Identity", "RefIdentity"});
+  std::vector<int> nodes_to_convert;
+  std::set<int> fanout;
+  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
+  for (int fanout_idx : fanout) {
+    const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
+    VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
+    if (IsStackPushOp(fanout_node)) {
+      nodes_to_convert.push_back(fanout_idx);
+    } else if (IsStackOp(fanout_node) || IsStackCloseOp(fanout_node) ||
+               op_types_to_traverse.find(fanout_node.op()) !=
+                   op_types_to_traverse.end()) {
+      continue;
+    } else if (!IsStackPopOp(fanout_node) ||
+               (!graph_view.outputs(fanout_idx).empty() ||
+                nodes_to_preserve.find(fanout_node.name()) !=
+                    nodes_to_preserve.end())) {
+      // The node is either a stack pop with consumers or something unexpected
+      // so we leave the graph alone.
+      nodes_to_convert.clear();
+      break;
+    }
+  }
+  return nodes_to_convert;
+}
+
+Status RemoveStackOps(const GrapplerItem& item, GraphDef* optimized_graph) {
+  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
+  const GraphDef& graph = item.graph;
+  *optimized_graph = graph;
+  NodeMap node_map(optimized_graph);
+  SimpleGraphView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
+  for (int node_idx = 0; node_idx < graph.node_size(); ++node_idx) {
+    if (IsStackOp(graph.node(node_idx))) {
+      for (int push_node_idx : GetStackPushNodesToConvert(
+               graph_view, nodes_to_preserve, node_idx)) {
+        // We found push nodes without corresponding pops. Convert them to
+        // Identity passing the data through and add a control dependency from
+        // the op supplying the stack handle.
+        NodeDef* push_node = optimized_graph->mutable_node(push_node_idx);
+        VLOG(1) << "Converting " << push_node_idx << " : "
+                << push_node->DebugString();
+        if (push_node->attr().count("swap_memory") != 0) {
+          push_node->mutable_attr()->erase("swap_memory");
+        }
+        push_node->set_op("Identity");
+        push_node->mutable_input()->SwapElements(0, 1);
+        const string ctrl_dep = ConstantFolding::AddControlDependency(
+            push_node->input(1), optimized_graph, &node_map);
+        push_node->set_input(1, ctrl_dep);
+        VLOG(1) << "After converting: " << push_node->DebugString();
+      }
+    }
+  }
+  return Status::OK();
+}
 
-  TF_RETURN_IF_ERROR(RemoveStackOps(item, optimized_graph));
+}  // namespace
 
-  if (opt_level_ == RewriterConfig::AGGRESSIVE) {
-    optimized_graph_ = optimized_graph;
-    // Set up helper data structures.
-    node_map_.reset(new NodeMap(optimized_graph_));
-    int num_frames;
-    TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                                 &frame_map_, &num_frames));
-    TF_RETURN_IF_ERROR(LoopInvariantNodeMotion());
+Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  *optimized_graph = item.graph;
+  // Set up helper data structures.
+  if (options_.enable_loop_invariant_node_motion) {
+    LoopInvariantNodeMotionOptimizer linm_optimizer(optimized_graph);
+    TF_RETURN_IF_ERROR(linm_optimizer.Optimize());
+  }
+  if (options_.enable_stack_push_removal) {
+    TF_RETURN_IF_ERROR(RemoveStackOps(item, optimized_graph));
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index c1b0321e4e16f2c34a8016fe51068a79634a9617..a422505d23c197a6fd677c97e326b529a4bd57b2 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -30,9 +30,13 @@ constexpr char kLoopOptimizer[] = "LoopOptimizer";
 
 class LoopOptimizer : public GraphOptimizer {
  public:
-  LoopOptimizer() : opt_level_(RewriterConfig::ON) {}
+  LoopOptimizer()
+      : opt_level_(RewriterConfig::ON),
+        options_(LoopOptimizerOptions::Default(RewriterConfig::ON)) {}
   explicit LoopOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+      : opt_level_(opt_level),
+        options_(LoopOptimizerOptions::Default(RewriterConfig::ON)) {}
+
   ~LoopOptimizer() override {}
 
   string name() const override { return "loop_optimizer"; };
@@ -44,29 +48,21 @@ class LoopOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  Status LoopInvariantNodeMotion();
-  Status FindInvariantNodes(NodeDef* node);
-  Status RevertInvariantNodes();
-  Status MoveInvariantNodes(const int frame_id);
-  Status LINMHandleInvariantNode(NodeDef* node, const int num_outputs,
-      const int frame_id);
-  Status LINMHandleConst(NodeDef* node, const int num_outputs,
-      const int frame_id);
-  Status LINMHandleInvariantEnter(NodeDef* node, const int num_outputs);
-
-  std::map<NodeDef*, int> invariant_nodes_;
-  std::set<int> empty_set_;
-  std::map<int, std::set<int>> frame_children_;
-  std::map<int, int> frame_parent_;
-  std::map<int, const NodeDef*> loop_cond_;
-  std::map<int, std::vector<NodeDef*>> invariant_enters_;
-  int new_enter_id_;
-  RewriterConfig::Toggle opt_level_;
+  friend class LoopOptimizerTest;
+
+  // Granular control for loop optimizer stages.
+  struct LoopOptimizerOptions {
+    bool enable_loop_invariant_node_motion = true;
+    bool enable_stack_push_removal = true;
 
-  std::unique_ptr<NodeMap> node_map_;
-  FrameMap frame_map_;
-  std::unique_ptr<GraphProperties> graph_properties_;
-  GraphDef* optimized_graph_;  // Not owned.
+    static LoopOptimizerOptions Default(RewriterConfig::Toggle opt_level) {
+      LoopOptimizerOptions options;
+      return options;
+    }
+  };
+
+  RewriterConfig::Toggle opt_level_;
+  LoopOptimizerOptions options_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index a0bd3351976ccbeddd8778281dbdc0c07bbd6455..10ec544424e651a1c0d39ef6af9a8f824de6c99e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
 class LoopOptimizerTest : public GrapplerTest {
  protected:
@@ -57,6 +56,23 @@ class LoopOptimizerTest : public GrapplerTest {
     attributes.emplace_back("T", type);
     AddNode(name, op, inputs, attributes, graph);
   }
+
+  void DisableAllStages(LoopOptimizer* optimizer) {
+    LoopOptimizer::LoopOptimizerOptions options;
+    options.enable_loop_invariant_node_motion = false;
+    options.enable_stack_push_removal = false;
+    optimizer->options_ = options;
+  }
+
+  void EnableOnlyLoopInvariantNodeMotion(LoopOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.enable_loop_invariant_node_motion = true;
+  }
+
+  void EnableOnlyStackPushRemoval(LoopOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.enable_stack_push_removal = true;
+  }
 };
 
 TEST_F(LoopOptimizerTest, Basic) {
@@ -81,7 +97,8 @@ TEST_F(LoopOptimizerTest, Basic) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -128,7 +145,8 @@ TEST_F(LoopOptimizerTest, Const) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -175,7 +193,8 @@ TEST_F(LoopOptimizerTest, ControlOutput) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -235,7 +254,8 @@ TEST_F(LoopOptimizerTest, NestedLoop1) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -302,7 +322,8 @@ TEST_F(LoopOptimizerTest, NestedLoop2) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -365,7 +386,8 @@ TEST_F(LoopOptimizerTest, NestedLoopConst1) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -429,7 +451,8 @@ TEST_F(LoopOptimizerTest, NestedLoopConst2) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -475,6 +498,7 @@ TEST_F(LoopOptimizerTest, NoOp) {
   CHECK(fake_input.NextItem(&item));
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -504,6 +528,7 @@ TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
   AddSimpleNode("stop", "StopGradient", {"stack3"}, &graph);
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -534,6 +559,7 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   item.fetch.push_back("pop4");
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -563,6 +589,5 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   }
 }
 
-}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 27e9d2c78d0456e61d31f7f772172fb8d17a11ac..c1fee0e993dd18578fec561b5bfc2b7b8987d31f 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -1227,7 +1227,7 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                              recomputation_targets_name_scope_, optimized_graph,
                              item);
 
-  GrapplerItem optimized_item(item, std::move(*optimized_graph));
+  GrapplerItem optimized_item(item, optimized_graph);
   std::unordered_set<string> skip_list;
   // Bound the number of rewrite passes to avoid long processing times on graphs
   // that simply won't fit in memory.
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 5723e397abe2348bec82fb939ea8bfca1df72eb7..558b8a77e8aefde860f9394b408d7f452659e59c 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -178,45 +178,41 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
           ? 1
           : cfg_.meta_optimizer_iterations();
+  GrapplerItem optimized_item = item;
+  optimized_graph->Swap(&optimized_item.graph);
   for (int iteration = 0; iteration < num_iterations; ++iteration) {
     VLOG(1) << "Starting optimization iteration " << iteration + 1;
     for (const auto& optimizer : optimizers) {
+      // Invariant: optimized_graph contains the most recently optimized
+      // version of the graph.
       if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
         continue;
       }
-      if (!already_optimized) {
-        Status status = optimizer->Optimize(cluster, item, optimized_graph);
-        string result;
-        if (!status.ok()) {
-          VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                  << ". Return status: " << status.ToString();
-          result = status.ToString();
-        } else {
-          already_optimized = true;
-          result = strings::StrCat(
-              "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
-        }
-        result_.push_back(std::make_pair(optimizer->name(), result));
-        VLOG(1) << "Optimizer " << optimizer->name()
-                << " return status: " << result;
+      uint64 start_us = Env::Default()->NowMicros();
+      // This swaps the current optimized_graph into optimized item and
+      // resets optimized_graph to an empty graph.
+      optimized_graph->Swap(&optimized_item.graph);
+      *optimized_graph = GraphDef();
+      Status status =
+          optimizer->Optimize(cluster, optimized_item, optimized_graph);
+
+      uint64 end_us = Env::Default()->NowMicros();
+      float duration_ms = (end_us - start_us) / 1000.0f;
+      string result;
+      if (!status.ok()) {
+        VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
+                << status.ToString();
+        optimized_graph->Swap(&optimized_item.graph);
+        result = status.ToString();
       } else {
-        GrapplerItem optimized_item(item, std::move(*optimized_graph));
-        Status status =
-            optimizer->Optimize(cluster, optimized_item, optimized_graph);
-        string result;
-        if (!status.ok()) {
-          VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
-                  << status.ToString();
-          optimized_graph->Swap(&optimized_item.graph);
-          result = status.ToString();
-        } else {
-          result = strings::StrCat(
-              optimizer->name(), ": ",
-              PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
-        }
-        result_.push_back(std::make_pair(optimizer->name(), result));
-        VLOG(1) << result;
+        already_optimized = true;
+        result = strings::StrCat(
+            optimizer->name(), ": ",
+            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
+            ", time = ", duration_ms, "ms.");
       }
+      result_.emplace_back(optimizer->name(), result);
+      VLOG(1) << result;
     }
   }
 
@@ -230,10 +226,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
               item.graph.library().gradient_size());
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
-  } else {
-    *optimized_graph = item.graph;
   }
-
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index 8480a74572883a4657e11606b4cb8dcd5532ea3a..2b12eadec96fa93d576ebf9036a2b2b1af9acc59 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -26,7 +28,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ModelPrunerTest : public ::testing::Test {};
+class ModelPrunerTest : public GrapplerTest {};
 
 TEST_F(ModelPrunerTest, NoPruning) {
   // This trivial graph is so basic there's nothing to prune.
@@ -86,6 +88,13 @@ TEST_F(ModelPrunerTest, StopGradientPruning) {
   EXPECT_EQ(NodeName(b.name()), new_e.input(0));
   EXPECT_EQ(1, new_d.input_size());
   EXPECT_EQ(NodeName(b.name()), new_d.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, IdentityPruning) {
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 5893f286ed267f0600a40ef58eeff9f98b472e2e..7398d2c896dc01ca35a949be3942509b9b35608d 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -142,38 +142,12 @@ bool IsSameInput(const string& name1, const string& name2) {
     return true;
   }
   int position1;
-  string node1 = ParseNodeName(name1, &position1);
+  StringPiece node1 = ParseNodeNameAsStringPiece(name1, &position1);
   int position2;
-  string node2 = ParseNodeName(name2, &position2);
+  StringPiece node2 = ParseNodeNameAsStringPiece(name2, &position2);
   return (position1 == position2) && (node1 == node2);
 }
 
-string ParseNodeName(const string& name, int* position) {
-  // Strip the prefix '^' (if any), and strip the trailing ":{digits} (if any)
-  // to get a node name.
-  strings::Scanner scan(name);
-  scan.ZeroOrOneLiteral("^")
-      .RestartCapture()
-      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
-      .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
-  StringPiece capture;
-  StringPiece remaining;
-  if (scan.Peek(':') != ':' || !scan.GetResult(&remaining, &capture)) {
-    *position = 0;
-    return "";
-  } else {
-    if (name[0] == '^') {
-      *position = -1;
-    } else if (remaining.empty()) {
-      *position = 0;
-    } else {
-      // Skip the first ':' character.
-      CHECK(strings::safe_strto32(remaining.substr(1), position));
-    }
-    return capture.ToString();
-  }
-}
-
 bool IsControlInput(const string& name) {
   return !name.empty() && name[0] == '^';
 }
@@ -185,7 +159,7 @@ string NodeName(const string& name) {
 
 int NodePosition(const string& name) {
   int position;
-  ParseNodeName(name, &position);
+  ParseNodeNameAsStringPiece(name, &position);
   return position;
 }
 
@@ -275,13 +249,20 @@ int NumNonControlInputs(const NodeDef& node) {
 
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
   int num_outputs = 0;
+  int pos;
   for (const NodeDef* output : node_map.GetOutputs(node.name())) {
     for (const string& node_as_input : output->input()) {
       if (IsControlInput(node_as_input)) {
         break;
       }
-      if (NodeName(node_as_input) == node.name()) {
+      if (node_as_input == node.name()) {
         ++num_outputs;
+      } else {
+        const StringPiece name =
+            ParseNodeNameAsStringPiece(node_as_input, &pos);
+        if (name == node.name()) {
+          ++num_outputs;
+        }
       }
     }
   }
@@ -430,18 +411,28 @@ Status SimpleGraphView::Initialize(const GraphDef& graph, bool dedup_inputs,
 }
 
 void SimpleGraphView::DepthFirstSearch(
-    const std::unordered_set<string>& op_types_to_traverse, int node_idx,
+    const std::unordered_set<string>& op_types_to_traverse, int root_node,
     std::set<int>* nodes_found) const {
-  if (nodes_found->find(node_idx) != nodes_found->end()) {
-    return;
-  }
-  nodes_found->insert(node_idx);
-  const string& op_type = graph_->node(node_idx).op();
+  nodes_found->clear();
+  const string& op_type = graph_->node(root_node).op();
   if (op_types_to_traverse.find(op_type) == op_types_to_traverse.end()) {
     return;
   }
-  for (auto output_idx : this->outputs(node_idx)) {
-    DepthFirstSearch(op_types_to_traverse, output_idx, nodes_found);
+  std::vector<int> stack;
+  stack.reserve(32);
+  stack.push_back(root_node);
+  while (!stack.empty()) {
+    const int node_idx = stack.back();
+    stack.pop_back();
+    nodes_found->insert(node_idx);
+    const string& op_type = graph_->node(node_idx).op();
+    if (op_types_to_traverse.find(op_type) != op_types_to_traverse.end()) {
+      for (auto output_idx : this->outputs(node_idx)) {
+        if (nodes_found->find(output_idx) == nodes_found->end()) {
+          stack.push_back(output_idx);
+        }
+      }
+    }
   }
 }
 
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 11555d712abd1de538aa8526f1574f249f630cbf..b15667dca26968cce0cbdb5c6f52af07a7417e4f 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -26,8 +26,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -107,8 +109,38 @@ string NodeName(const string& name);
 // Get the trailing position number ":{digits}" (if any) of a node name.
 int NodePosition(const string& name);
 
+inline StringPiece ParseNodeNameAsStringPiece(const string& name,
+                                              int* position) {
+  // Strip the prefix '^' (if any), and strip the trailing ":{digits} (if any)
+  // to get a node name.
+  strings::Scanner scan(name);
+  scan.ZeroOrOneLiteral("^")
+      .RestartCapture()
+      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
+      .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
+  StringPiece capture;
+  StringPiece remaining;
+  if (scan.Peek(':') != ':' || !scan.GetResult(&remaining, &capture)) {
+    *position = 0;
+    static const string empty;
+    return StringPiece(empty);
+  } else {
+    if (name[0] == '^') {
+      *position = -1;
+    } else if (remaining.empty()) {
+      *position = 0;
+    } else {
+      // Skip the first ':' character.
+      CHECK(strings::safe_strto32(remaining.substr(1), position));
+    }
+    return capture;
+  }
+}
+
 // Returns the node name and position in a single call.
-string ParseNodeName(const string& name, int* position);
+inline string ParseNodeName(const string& name, int* position) {
+  return ParseNodeNameAsStringPiece(name, position).ToString();
+}
 
 // Add a prefix to a node name with a custom delimiter.
 string AddPrefixToNodeName(const string& name, const string& prefix,
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 7419c26dff58067856c5e5280edcecd77a41c6c7..b473f32c4503b8e828f37d6b0a870402f45c9fc8 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -161,8 +161,11 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
     ],
 )
@@ -175,6 +178,8 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 4f286ce1c8bc3df4065f39c1744600d457173c2e..e8d423a7595047899426f634717daa6f295f2da2 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -23,130 +23,545 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
 namespace grappler {
 
-std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr,
-    const FunctionDefLibrary& library) {
-  if (func.signature().name().empty()) {
-    LOG(ERROR) << "function name must be specified.";
-    return nullptr;
-  }
-  std::unique_ptr<GrapplerItem> new_item(new GrapplerItem());
-  new_item->id = func.signature().name();
-
-  std::unordered_map<string, string> port_map;
-
-  // Add the function inputs as placeholder
-  for (const auto& inp : func.signature().input_arg()) {
-    NodeDef* ph = new_item->graph.add_node();
-    ph->set_name(inp.name());
-    ph->set_op("Placeholder");
-    if (inp.type() != DT_INVALID) {
-      (*ph->mutable_attr())["T"].set_type(inp.type());
+namespace {
+
+Status OutputNameRange(const FunctionLibraryDefinition& flib,
+                       const NodeDef& node,
+                       tensorflow::NameRangeMap* outputs_range_map) {
+  const OpRegistrationData* registration;
+  TF_RETURN_IF_ERROR(flib.LookUp(node.op(), &registration));
+  TF_RETURN_IF_ERROR(tensorflow::NameRangesForNode(node, registration->op_def,
+                                                   nullptr, outputs_range_map));
+  return Status::OK();
+}
+
+Status RegisterFunctionBodyOutputs(const FunctionLibraryDefinition& flib,
+                                   const NodeDef& node,
+                                   GrapplerFunctionConnectivity* connectivity) {
+  tensorflow::NameRangeMap outputs_range_map;
+  TF_RETURN_IF_ERROR(OutputNameRange(flib, node, &outputs_range_map));
+  connectivity->RegisterFunctionBodyOutputs(node.name(), outputs_range_map);
+  return Status::OK();
+}
+
+// Replace the placeholder attribute values with the values specified in
+// instantiation attributes.
+Status ResolveFunctionBodyNodeAttrPlaceholders(
+    const AttrValueMap& func_instantiation_attr, NodeDef* node) {
+  for (auto& attr : *node->mutable_attr()) {
+    const string& placeholder = attr.second.placeholder();
+    if (placeholder.empty()) continue;
+
+    auto it = func_instantiation_attr.find(placeholder);
+    if (it != func_instantiation_attr.end()) {
+      attr.second = it->second;
     } else {
-      auto it = func_attr.find(inp.type_attr());
-      if (it == func_attr.end()) {
-        LOG(ERROR) << "Unknown type attribute " << inp.type_attr()
-                   << " for function input " << inp.name();
-        return nullptr;
+      return errors::InvalidArgument("Can't resolve placeholder: ",
+                                     placeholder);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
+    const InputArgExpansion& input_arg_expansion) {
+  const auto& input_name = input_arg_expansion.input_name;
+  const auto& placeholders = input_arg_expansion.placeholders;
+  input_arg_expansions_.emplace(input_name, input_arg_expansion);
+  for (int i = 0; i < placeholders.size(); ++i) {
+    const string& placeholder = input_arg_expansion.placeholders[i];
+    input_arg_placeholders_.emplace(
+        placeholder, InputArgPlaceholder{input_name, /*position=*/i});
+  }
+}
+
+void GrapplerFunctionConnectivity::RegisterFunctionBodyOutputs(
+    const string& node_name, const tensorflow::NameRangeMap& outputs) {
+  function_body_outputs_[node_name] = outputs;
+}
+
+Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
+    const string& func_def_input, std::vector<string>* graph_def_inputs) const {
+  using ::tensorflow::strings::Scanner;
+
+  if (IsControlInput(func_def_input)) {
+    graph_def_inputs->push_back(func_def_input);
+    return Status::OK();
+  }
+
+  // Parse input format: "node_name[:node_output][:position]"
+  string node_name;
+  string node_output;
+  int position = -1;
+
+  StringPiece capture;
+  StringPiece remaining;
+
+  // Parse "node_name"
+  if (Scanner(func_def_input)
+          .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
+          .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE)
+          .GetResult(&remaining, &capture)) {
+    node_name = string(capture.data(), capture.size());
+  }
+
+  // Parse "node_output" if it exists
+  if (Scanner(remaining)
+          .OneLiteral(":")
+          .RestartCapture()
+          .One(strings::Scanner::LOWERLETTER)
+          .Any(strings::Scanner::LETTER_DIGIT_UNDERSCORE)
+          .GetResult(&remaining, &capture)) {
+    node_output = string(capture.data(), capture.size());
+  }
+
+  // Parse "position" if it exists
+  if (Scanner(remaining)
+          .OneLiteral(":")
+          .RestartCapture()
+          .Many(strings::Scanner::DIGIT)
+          .GetResult(nullptr, &capture)) {
+    CHECK(strings::safe_strto32(capture, &position));
+  }
+
+  // If "node_output" is not empty, it must be an output of a function body node
+  bool is_function_body_output = !node_output.empty();
+
+  // Function input argument: "node_name[:position]"
+  if (!is_function_body_output) {
+    auto input_arg = input_arg_expansions_.find(node_name);
+    if (input_arg != input_arg_expansions_.end()) {
+      const InputArgExpansion& input_arg_expansion = input_arg->second;
+      const auto& placeholders = input_arg_expansion.placeholders;
+
+      if (position == -1) {
+        // If position is not defined use all placeholders
+        graph_def_inputs->reserve(placeholders.size());
+        for (const string& placeholder : placeholders) {
+          graph_def_inputs->push_back(placeholder);
+        }
       } else {
-        (*ph->mutable_attr())["T"] = it->second;
+        if (position > input_arg_expansion.placeholders.size() - 1) {
+          return errors::InvalidArgument("Invalid input ", node_name,
+                                         "position: ", position,
+                                         " (out of range)");
+        }
+        graph_def_inputs->push_back(input_arg_expansion.placeholders[position]);
       }
+
+      return Status::OK();
     }
-    port_map[inp.name()] = inp.name();
   }
 
-  // Add the function body to the graph.
-  FunctionLibraryDefinition func_def(OpRegistry::Global(), library);
+  // Function body output: "node_name:node_output[:position]"
+  if (is_function_body_output) {
+    auto function_body_outputs = function_body_outputs_.find(node_name);
+    if (function_body_outputs != function_body_outputs_.end()) {
+      const tensorflow::NameRangeMap& outputs = function_body_outputs->second;
+      auto output = outputs.find(node_output);
+      if (output != outputs.end()) {
+        const auto& output_range = output->second;
+
+        if (position == -1) {
+          // If position is not defined expand node output range
+          for (int i = output_range.first; i < output_range.second; ++i) {
+            i == 0 ? graph_def_inputs->push_back(node_name)
+                   : graph_def_inputs->push_back(
+                         strings::StrCat(node_name, ":", i));
+          }
+        } else {
+          if (position > (output_range.second - output_range.first)) {
+            return errors::InvalidArgument(
+                "Invalid node ", node_name, " output ", node_output,
+                " position: ", position, " (out of range)");
+          }
+          int pos = output_range.first + position;
+          pos == 0 ? graph_def_inputs->push_back(node_name)
+                   : graph_def_inputs->push_back(
+                         strings::StrCat(node_name, ":", pos));
+        }
 
-  for (const NodeDef& node : func.node_def()) {
-    NodeDef* new_node = new_item->graph.add_node();
-    *new_node = node;
-    // Replace the placeholder attribute values with the specified value.
-    for (auto& attr : *new_node->mutable_attr()) {
-      const string& ph_name = attr.second.placeholder();
-      auto it = func_attr.find(ph_name);
-      if (it != func_attr.end()) {
-        attr.second = it->second;
+        return Status::OK();
       }
     }
+  }
 
-    // Functions use a custom format to encode connectivity. Map these custom
-    // strings to regular ones.
-    const OpRegistrationData* registration;
-    Status status = func_def.LookUp(node.op(), &registration);
-    if (!status.ok()) {
-      LOG(ERROR) << "Op " << node.op() << " not registered: " << status;
-      return nullptr;
-    }
+  return errors::InvalidArgument("Failed to expand a function def input: ",
+                                 func_def_input);
+}
+
+Status GrapplerFunctionConnectivity::ExpandNodeInputs(
+    NodeDef* function_body_node) const {
+  std::vector<string> expanded_inputs;
+
+  for (const string& function_def_input : function_body_node->input()) {
+    TF_RETURN_IF_ERROR(
+        ExpandFunctionDefInput(function_def_input, &expanded_inputs));
+  }
+
+  function_body_node->clear_input();
+  for (const string& expanded_input : expanded_inputs)
+    function_body_node->add_input(expanded_input);
+  return Status::OK();
+}
+
+Status GrapplerFunctionConnectivity::AsFunctionDefInput(
+    const string& graph_def_input, string* func_def_input) const {
+  using gtl::FindOrNull;
+
+  if (IsControlInput(graph_def_input)) {
+    *func_def_input = graph_def_input;
+    return Status::OK();
+  }
+
+  int position;
+  string node_name = ParseNodeName(graph_def_input, &position);
+  CHECK_GE(position, 0);
 
-    tensorflow::NameRangeMap inputs;
-    tensorflow::NameRangeMap outputs;
-    status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs,
-                                           &outputs);
-    if (!status.ok()) {
-      LOG(ERROR) << "Op " << node.op() << " invalid: " << status;
-      return nullptr;
+  // Check if it's an input arg placeholder
+  if (position == 0) {
+    const InputArgPlaceholder* placeholder =
+        FindOrNull(input_arg_placeholders_, node_name);
+    if (placeholder != nullptr) {
+      *func_def_input =
+          strings::StrCat(placeholder->input_name, ":", placeholder->position);
+      return Status::OK();
     }
-    for (const auto& name_range : outputs) {
-      string port_prefix =
-          strings::StrCat(node.name(), ":", name_range.first, ":");
-      int index_start = name_range.second.first;
-      int index_end = name_range.second.second;
-      for (int i = index_start; i < index_end; ++i) {
-        string port_id = strings::StrCat(port_prefix, i - index_start);
-        string port_name = strings::StrCat(node.name(), ":", i);
-        port_map[port_id] = port_name;
+  }
+
+  // It must be output from one of the function body nodes
+  const tensorflow::NameRangeMap* outputs_range_map =
+      FindOrNull(function_body_outputs_, node_name);
+  if (outputs_range_map != nullptr) {
+    for (const auto& el : *outputs_range_map) {
+      const auto& output_name = el.first;
+      const auto& output_range = el.second;
+      if (position >= output_range.first && position < output_range.second) {
+        int pos = position - output_range.first;
+        *func_def_input =
+            strings::StrCat(node_name, ":", output_name, ":", pos);
+        return Status::OK();
       }
     }
   }
 
-  for (auto& node : *new_item->graph.mutable_node()) {
-    // Rewrite the inputs to use the normal naming convention.
-    for (int i = 0; i < node.input_size(); ++i) {
-      const string& input = node.input(i);
-      if (IsControlInput(input)) {
-        // No need to remap control dependencies.
-        continue;
-      } else {
-        auto it = port_map.find(input);
-        if (it == port_map.end()) {
-          LOG(ERROR) << "Unknown input: " << input;
-          return nullptr;
-        }
-        node.set_input(i, it->second);
-      }
+  return errors::InvalidArgument("Unknown graph def input: ", graph_def_input);
+}
+
+Status GrapplerFunctionConnectivity::AsFunctionDefNode(
+    NodeDef* function_body_node) const {
+  string func_def_input;
+
+  for (int i = 0; i < function_body_node->input_size(); ++i) {
+    TF_RETURN_IF_ERROR(
+        AsFunctionDefInput(function_body_node->input(i), &func_def_input));
+    function_body_node->set_input(i, func_def_input);
+  }
+
+  return Status::OK();
+}
+
+Status GrapplerFunctionItemInstantiation::GetTypeAttr(
+    const string& type_attr_name, DataType* data_type) const {
+  auto it = func_instantiation_attr_->find(type_attr_name);
+  if (it == func_instantiation_attr_->end()) {
+    return errors::InvalidArgument("Type attribute ", type_attr_name,
+                                   " is not defined");
+  } else if (it->second.type() == DT_INVALID) {
+    return errors::InvalidArgument("Type attribute ", type_attr_name,
+                                   " is not defined with a valid type");
+  } else {
+    *data_type = it->second.type();
+  }
+  return Status::OK();
+}
+
+Status GrapplerFunctionItemInstantiation::GetArgType(
+    const OpDef::ArgDef& arg, DataType* data_type) const {
+  if (arg.type() != DT_INVALID) {
+    *data_type = arg.type();
+  } else {
+    if (!arg.type_list_attr().empty() || !arg.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Arguments with sequence of tensors are not supported. Unsupported "
+          "argument name: ",
+          arg.name());
     }
+    TF_RETURN_IF_ERROR(GetTypeAttr(arg.type_attr(), data_type));
   }
+  return Status::OK();
+}
 
-  // Add the function outputs to the list of fetch nodes, taking into account
-  // the output mapping if any.
-  for (const auto& out : func.signature().output_arg()) {
-    auto it = func.ret().find(out.name());
-    if (it != func.ret().end()) {
-      auto it2 = port_map.find(it->second);
-      if (it2 == port_map.end()) {
-        LOG(ERROR) << "Unknown output mapping: " << it->first << " to "
-                   << it->second;
-        return nullptr;
-      } else {
-        new_item->fetch.emplace_back(it2->second);
-      }
-    } else {
-      new_item->fetch.emplace_back(out.name());
+GrapplerFunctionItem::GrapplerFunctionItem(
+    const string& func_name, const AttrValueMap& func_attr,
+    const std::vector<InputArgExpansion>& input_arg_expansions,
+    const std::vector<OutputArgExpansion>& output_arg_expansions,
+    GraphDef&& function_body)
+    : func_attr_(func_attr),
+      input_arg_expansions_(input_arg_expansions),
+      output_arg_expansions_(output_arg_expansions) {
+  id = func_name;
+  // Fill the feed nodes with input placeholders
+  for (const InputArgExpansion& input_arg : input_arg_expansions_) {
+    for (const string& placeholder : input_arg.placeholders) {
+      feed.emplace_back(placeholder, Tensor());
+      input_arg_placeholders_.insert(placeholder);
+    }
+  }
+  // Fill the fetch nodes with outputs
+  for (const OutputArgExpansion& output_arg : output_arg_expansions_) {
+    for (const string& output_tensor : output_arg.output_tensors) {
+      fetch.push_back(output_tensor);
+    }
+  }
+  // Swap the graph body
+  graph.Swap(&function_body);
+}
+
+const std::vector<InputArgExpansion>& GrapplerFunctionItem::inputs() const {
+  return input_arg_expansions_;
+}
+
+const InputArgExpansion& GrapplerFunctionItem::input(int i) const {
+  return input_arg_expansions_[i];
+}
+
+const std::size_t GrapplerFunctionItem::input_size() const {
+  return input_arg_expansions_.size();
+}
+
+bool GrapplerFunctionItem::IsInputPlaceholder(const string& node_name) const {
+  return input_arg_placeholders_.find(node_name) !=
+         input_arg_placeholders_.end();
+}
+
+const std::vector<OutputArgExpansion>& GrapplerFunctionItem::outputs() const {
+  return output_arg_expansions_;
+}
+
+const OutputArgExpansion& GrapplerFunctionItem::output(int i) const {
+  return output_arg_expansions_[i];
+}
+
+const std::size_t GrapplerFunctionItem::output_size() const {
+  return output_arg_expansions_.size();
+}
+
+const AttrValueMap& GrapplerFunctionItem::func_attr() const {
+  return func_attr_;
+}
+
+const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
+
+GraphDef& GrapplerFunctionItem::mutable_function_body() { return graph; }
+
+GrapplerFunctionItem& GrapplerFunctionItem::SwapFunctionBody(GraphDef&& other) {
+  graph.Swap(&other);
+  return *this;
+}
+
+std::vector<string> OutputTensors(const GrapplerFunctionItem& item) {
+  std::vector<string> output_tensors;
+  for (const OutputArgExpansion& output : item.outputs()) {
+    for (const string& tensor : output.output_tensors) {
+      output_tensors.push_back(tensor);
+    }
+  }
+  return output_tensors;
+}
+
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const AttrValueMap& func_instantiation_attr,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item) {
+  const OpDef& signature = func.signature();
+
+  if (signature.name().empty()) {
+    return errors::InvalidArgument("Function name must be specified");
+  }
+
+  // Function types will be resolved from function instantiation attributes. All
+  // other attributes will be lost during conversion to FunctionDef.
+  for (const OpDef::AttrDef& attr : signature.attr()) {
+    if (attr.type() != "type") {
+      return errors::InvalidArgument(
+          "Function signature must have only type attributes");
+    }
+  }
+
+  // Helper methods to lookup function instantiation attributes
+  GrapplerFunctionItemInstantiation instantiation(&func_instantiation_attr);
+
+  // Mapping from FunctionDef input format (name[:output][:position]) to
+  // GraphDef input format (name[:position])
+  GrapplerFunctionConnectivity connectivity;
+
+  std::vector<InputArgExpansion> inputs;
+  std::vector<OutputArgExpansion> outputs;
+
+  // Function body shares the library with the graph that instantiated it.
+  GraphDef function_body;
+  *function_body.mutable_library() = flib.ToProto();
+
+  // TODO(ezhulenev): support functions with tensor sequence inputs/outputs
+
+  // Make sure that there is no tensor sequences in outputs
+  for (const OpDef::ArgDef& output : signature.output_arg()) {
+    if (!output.type_list_attr().empty() || !output.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Outputs with sequence of tensors are not supported. Unsupported "
+          "output: ",
+          output.name());
     }
   }
-  // Add the function inputs to the list of feeds.
-  for (const auto& inp : func.signature().input_arg()) {
-    new_item->feed.emplace_back(inp.name(), Tensor());
+
+  // For each input argument create a placeholder in function body.
+  for (const OpDef::ArgDef& input : signature.input_arg()) {
+    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Inputs with sequence of tensors are not supported. Unsupported "
+          "input: ",
+          input.name());
+    }
+
+    DataType input_data_type;
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(input, &input_data_type));
+
+    NodeDef* placeholder = function_body.add_node();
+    placeholder->set_name(input.name());
+    placeholder->set_op("Placeholder");
+    (*placeholder->mutable_attr())["T"].set_type(input_data_type);
+
+    InputArgExpansion input_expansion{/*input_name=*/input.name(),
+                                      /*data_type=*/input_data_type,
+                                      /*placeholders=*/{input.name()}};
+    connectivity.RegisterInputArgExpansion(input_expansion);
+    inputs.push_back(input_expansion);
+  }
+
+  // Add all function nodes to the function body
+  for (const NodeDef& func_def_node : func.node_def()) {
+    NodeDef* new_node = function_body.add_node();
+    *new_node = func_def_node;
+
+    // Resolve all placeholder values using function instantiation attributes.
+    TF_RETURN_IF_ERROR(ResolveFunctionBodyNodeAttrPlaceholders(
+        func_instantiation_attr, new_node));
+    // Register node output range in a function connectivity.
+    TF_RETURN_IF_ERROR(
+        RegisterFunctionBodyOutputs(flib, func_def_node, &connectivity));
+  }
+
+  // Rewrite inputs to use GraphDef format
+  for (NodeDef& node : *function_body.mutable_node()) {
+    TF_RETURN_IF_ERROR(connectivity.ExpandNodeInputs(&node));
+  }
+
+  // Add function outputs
+  for (const OpDef::ArgDef& out : signature.output_arg()) {
+    std::vector<string> output_tensors;
+    auto ret = func.ret().find(out.name());
+    TF_RETURN_IF_ERROR(
+        ret != func.ret().end()
+            // Expand outputs using provided output mapping
+            ? connectivity.ExpandFunctionDefInput(ret->second, &output_tensors)
+            // Otherwise output must be one of the function inputs
+            : connectivity.ExpandFunctionDefInput(out.name(), &output_tensors));
+
+    DataType output_data_type;
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+
+    OutputArgExpansion output{/*output_name=*/out.name(),
+                              /*data_type=*/output_data_type,
+                              /*output_tensors=*/output_tensors};
+    outputs.push_back(output);
+  }
+
+  *item = GrapplerFunctionItem(
+      /*func_name=*/signature.name(),
+      /*func_attr=*/AttrValueMap(func.attr().begin(), func.attr().end()),
+      inputs, outputs, std::move(function_body));
+  return Status::OK();
+}
+
+// Register GrapplerFunctionItem input arg expansion and function body outputs
+// in the GrapplerFunctionConnectivity
+Status RegisterGrapplerFunctionConnectivity(
+    const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
+    GrapplerFunctionConnectivity* connectivity) {
+  for (const InputArgExpansion& input : item.inputs()) {
+    connectivity->RegisterInputArgExpansion(input);
+  }
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    TF_RETURN_IF_ERROR(
+        RegisterFunctionBodyOutputs(flib, func_body_node, connectivity));
+  }
+  return Status::OK();
+}
+
+Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
+                                  const FunctionLibraryDefinition& flib,
+                                  FunctionDef* func) {
+  func->mutable_signature()->set_name(item.id);
+
+  // Build a GrapplerFunctionConnectivity from inputs and new function body.
+  GrapplerFunctionConnectivity connectivity;
+  TF_RETURN_IF_ERROR(
+      RegisterGrapplerFunctionConnectivity(item, flib, &connectivity));
+
+  // Add function input arguments.
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    OpDef::ArgDef arg_def;
+    arg_def.set_name(input_arg.input_name);
+    arg_def.set_type(input_arg.data_type);
+    *func->mutable_signature()->add_input_arg() = arg_def;
+  }
+
+  // Add function output arguments.
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    OpDef::ArgDef arg_def;
+    arg_def.set_name(output_arg.output_name);
+    arg_def.set_type(output_arg.data_type);
+    *func->mutable_signature()->add_output_arg() = arg_def;
+
+    CHECK(output_arg.output_tensors.size() == 1)  // do some sanity checking
+        << "Outputs of tensor sequences are not supported";
+
+    string ret;
+    for (const string& output_tensor : output_arg.output_tensors) {
+      TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(output_tensor, &ret));
+      (*func->mutable_ret())[output_arg.output_name] = ret;
+    }
+  }
+
+  // Copy function definition specific attributes.
+  for (const auto& attr : item.func_attr()) {
+    const auto& attr_name = attr.first;
+    const auto& attr_value = attr.second;
+    (*func->mutable_attr())[attr_name] = attr_value;
+  }
+
+  // Copy function body nodes to the FunctionDef and update input format
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    // Do not copy input placeholders
+    if (item.IsInputPlaceholder(func_body_node.name())) continue;
+
+    NodeDef* func_def_node = func->add_node_def();
+    *func_def_node = func_body_node;
+    TF_RETURN_IF_ERROR(connectivity.AsFunctionDefNode(func_def_node));
   }
 
-  return new_item;
+  return Status::OK();
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 8f9b7d848a89435e1839e540f33d87213beb8a45..2ac3917a66f376ad879acff437547ef3d469c930 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -19,19 +19,173 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 
 namespace tensorflow {
-
 namespace grappler {
 
-// Factory method for creating a GrapplerItem from a FunctionDef.
-// Returns nullptr if the given function def cannot be converted.
-std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
+using AttrValueMap = std::unordered_map<string, AttrValue>;
+
+// Depending on the function instantiation attributes, input argument to the
+// function might be a single tensor, list of tensors of the same type, or a
+// list of tensors of different types.
+//
+// InputArgExpansion keeps track of the placeholders that were added to the
+// function body in place of function inputs and a resolved input data type.
+struct InputArgExpansion {
+  // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
+  // different data types
+  string input_name;                 // name of the function input argument
+  DataType data_type;                // input data type
+  std::vector<string> placeholders;  // names of placeholder nodes in the
+                                     // function body
+};
+
+// Depending on the function instantiation attributes, output argument is mapped
+// to one or more outputs of one of the function body nodes.
+//
+// OutputArgExpansion keeps mapping from a function output arg to the output
+// tensors of a function body nodes and a resolved output data type
+struct OutputArgExpansion {
+  // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
+  // different data types
+  string output_name;                  // name of the function output argument
+  DataType data_type;                  // output data type
+  std::vector<string> output_tensors;  // names of output tensor from the
+                                       // function body nodes
+};
+
+// FunctionDef uses different connectivity encoding for the function body nodes,
+// then a GraphDef (see function.proto for details). Input name in FunctionDef
+// can potentially represent a sequence of tensors (instead just one tensor in
+// GraphDef), we need to expand it when converting from FunctionDef to GraphDef,
+// and fold it back when doing backward conversion.
+class GrapplerFunctionConnectivity {
+ public:
+  void RegisterInputArgExpansion(const InputArgExpansion& input_arg_expansion);
+  void RegisterFunctionBodyOutputs(const string& node_name,
+                                   const tensorflow::NameRangeMap& outputs);
+
+  // Expand input encoded in FunctionDef format (name[:output][:position]) into
+  // multiple inputs in GraphDef format (name[:position]).
+  Status ExpandFunctionDefInput(const string& func_def_input,
+                                std::vector<string>* graph_def_inputs) const;
+
+  // Update Node inputs from FunctionDef to GraphDef format.
+  Status ExpandNodeInputs(NodeDef* function_body_node) const;
+
+  // When expanding inputs in function def format, single input might be
+  // expanded into multiple tensors. When converting back to the function def
+  // format from graph def format, it's always a 1-to-1 relationship.
+  // FunctionDef built from GrapplerFunctionItem is always specialized to it's
+  // instantiation attributes and length of input args (and node def outputs) is
+  // known.
+
+  // Map from GraphDef input format to FunctionDef input format using registered
+  // input arg expansion and function body outputs.
+  Status AsFunctionDefInput(const string& graph_def_input,
+                            string* func_def_input) const;
+
+  // Update Node inputs from GraphDef to FunctionDef format.
+  Status AsFunctionDefNode(NodeDef* function_body_node) const;
+
+ private:
+  // Mapping from input name to input arg expansion.
+  std::unordered_map<string, InputArgExpansion> input_arg_expansions_;
+  // Mapping from function body node name to output names range map.
+  std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
+
+  struct InputArgPlaceholder {
+    string input_name;
+    int position;
+  };
+
+  // Mapping from input arg placeholder to the function input tensor.
+  std::unordered_map<string, InputArgPlaceholder> input_arg_placeholders_;
+};
+
+// Get Function type attributes using attributes of a node that instantiated
+// a function.
+class GrapplerFunctionItemInstantiation {
+ public:
+  explicit GrapplerFunctionItemInstantiation(
+      const AttrValueMap* func_instantiation_attr)
+      : func_instantiation_attr_(func_instantiation_attr) {}
+
+  // Get DataType from attributes by name. Return error if attribute is missing,
+  // or it doesn't define a valid data type.
+  Status GetTypeAttr(const string& type_attr_name, DataType* data_type) const;
+
+  // Get argument data type. If data type is not explicitly defined, uses
+  // provided attribute name to look it up in function attributes.
+  Status GetArgType(const OpDef::ArgDef& arg, DataType* data_type) const;
+
+ private:
+  const AttrValueMap* func_instantiation_attr_;  // do not own
+};
+
+// A special case of GrapplerItem, constructed from a TensorFlow Function.
+class GrapplerFunctionItem : public GrapplerItem {
+ public:
+  GrapplerFunctionItem() = default;
+  GrapplerFunctionItem(
+      const string& func_name, const AttrValueMap& func_attr,
+      const std::vector<InputArgExpansion>& input_arg_expansions,
+      const std::vector<OutputArgExpansion>& output_arg_expansions,
+      GraphDef&& function_body);
+
+  bool IsInputPlaceholder(const string& node_name) const;
+
+  const std::vector<InputArgExpansion>& inputs() const;
+  const InputArgExpansion& input(int i) const;
+  const std::size_t input_size() const;
+
+  const std::vector<OutputArgExpansion>& outputs() const;
+  const OutputArgExpansion& output(int i) const;
+  const std::size_t output_size() const;
+
+  const AttrValueMap& func_attr() const;
+  const GraphDef& function_body() const;
+  GraphDef& mutable_function_body();
+
+  GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other);
+
+ private:
+  AttrValueMap func_attr_;  // Attributes specific to function definition that
+                            // produced this item (FuncDef.attr field).
+
+  std::vector<InputArgExpansion> input_arg_expansions_;
+  std::vector<OutputArgExpansion> output_arg_expansions_;
+
+  std::set<string> input_arg_placeholders_;
+};
+
+// Return all output tensors referenced by item output args.
+std::vector<string> OutputTensors(const GrapplerFunctionItem& item);
+
+// Make a GrapplerFunctionItem from the function definition and attributes.
+// Return error if the given function def cannot be converted.
+Status MakeGrapplerFunctionItem(
     const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr,
-    const FunctionDefLibrary& library);
+    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
+    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+
+// Register GrapplerFunctionItem input arg expansion and function body outputs
+// in the GrapplerFunctionConnectivity.  Use function library definition to
+// lookup function body nodes output names and ranges.
+Status RegisterGrapplerFunctionConnectivity(
+    const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
+    GrapplerFunctionConnectivity* connectivity);
+
+// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function
+// library definition to lookup function body nodes output names and ranges.
+Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
+                                  const FunctionLibraryDefinition& flib,
+                                  FunctionDef* func);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 6a7d766b1c6b49f8fc13b3b0294f3e3f8a74eb35..a9a708bf6770c1ce7f68d0f0ce605d4fb6c702fc 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 
@@ -28,6 +30,128 @@ namespace {
 
 class FunctionsTest : public ::testing::Test {};
 
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandFunctionDefInput) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+
+  connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
+  connectivity.RegisterFunctionBodyOutputs("Func",
+                                           {{"o1", {0, 2}}, {"o2", {2, 4}}});
+
+  std::vector<string> inputs;
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputA", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("inputA", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputB", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("inputB_0", inputs[0]);
+  EXPECT_EQ("inputB_1", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("inputB:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("inputB_1", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Add:z", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Add", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("Func", inputs[0]);
+  EXPECT_EQ("Func:1", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2", &inputs));
+  ASSERT_EQ(2, inputs.size());
+  EXPECT_EQ("Func:2", inputs[0]);
+  EXPECT_EQ("Func:3", inputs[1]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1:0", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o1:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:1", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2:0", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:2", inputs[0]);
+
+  inputs.clear();
+  TF_EXPECT_OK(connectivity.ExpandFunctionDefInput("Func:o2:1", &inputs));
+  ASSERT_EQ(1, inputs.size());
+  EXPECT_EQ("Func:3", inputs[0]);
+}
+
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_AsFunctionDefInput) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+
+  connectivity.RegisterFunctionBodyOutputs("Add", {{"z", {0, 1}}});
+  connectivity.RegisterFunctionBodyOutputs("Func",
+                                           {{"o1", {0, 2}}, {"o2", {2, 4}}});
+
+  string input;
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputA", &input));
+  EXPECT_EQ("inputA:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputB_0", &input));
+  EXPECT_EQ("inputB:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("inputB_1", &input));
+  EXPECT_EQ("inputB:1", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Add", &input));
+  EXPECT_EQ("Add:z:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func", &input));
+  EXPECT_EQ("Func:o1:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:1", &input));
+  EXPECT_EQ("Func:o1:1", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:2", &input));
+  EXPECT_EQ("Func:o2:0", input);
+
+  TF_EXPECT_OK(connectivity.AsFunctionDefInput("Func:3", &input));
+  EXPECT_EQ("Func:o2:1", input);
+}
+
+TEST_F(FunctionsTest, GrapplerFunctionConnectivity_ExpandNodeInputs) {
+  GrapplerFunctionConnectivity connectivity;
+
+  connectivity.RegisterInputArgExpansion({"inputA", DT_FLOAT, {"inputA"}});
+  connectivity.RegisterInputArgExpansion(
+      {"inputB", DT_FLOAT, {"inputB_0", "inputB_1"}});
+
+  NodeDef node;
+  node.add_input("inputA:0");
+  node.add_input("inputB");
+
+  TF_EXPECT_OK(connectivity.ExpandNodeInputs(&node));
+
+  EXPECT_EQ(3, node.input_size());
+  EXPECT_EQ("inputA", node.input(0));
+  EXPECT_EQ("inputB_0", node.input(1));
+  EXPECT_EQ("inputB_1", node.input(2));
+}
+
 TEST_F(FunctionsTest, FromSimpleFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -48,37 +172,45 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
-  CHECK(item);
-  EXPECT_EQ("XTimesTwo", item->id);
-  EXPECT_EQ(4, item->graph.node_size());
-  EXPECT_EQ(std::vector<string>({"y:0"}), item->fetch);
-  EXPECT_EQ(1, item->feed.size());
-  EXPECT_EQ("x", item->feed[0].first);
-
-  for (const NodeDef &node : item->graph.node()) {
-    if (node.name() == "x") {
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ("XTimesTwo", item.id);
+  EXPECT_EQ(4, item.function_body().node_size());
+
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ("x", item.input(0).input_name);
+  EXPECT_EQ(std::vector<string>{"x"}, item.input(0).placeholders);
+
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("y", item.output(0).output_name);
+  EXPECT_EQ("y", item.output(0).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    if (node.name() == "x" && count++) {
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "two") {
+    } else if (node.name() == "two" && count++) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "scale") {
+    } else if (node.name() == "scale" && count++) {
       EXPECT_EQ("Cast", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("two:0", node.input(0));
-    } else if (node.name() == "y") {
+      EXPECT_EQ("two", node.input(0));
+    } else if (node.name() == "y" && count++) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("scale:0", node.input(1));
+      EXPECT_EQ("scale", node.input(1));
     }
   }
+  EXPECT_EQ(4, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
@@ -115,45 +247,53 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
-  CHECK(item);
-  EXPECT_EQ("SubGrad", item->id);
-  EXPECT_EQ(12, item->graph.node_size());
-  EXPECT_EQ(std::vector<string>({"dx:0", "dy:0"}), item->fetch);
-  EXPECT_EQ(3, item->feed.size());
-  EXPECT_EQ("x", item->feed[0].first);
-  EXPECT_EQ("y", item->feed[1].first);
-  EXPECT_EQ("dz", item->feed[2].first);
-
-  for (const NodeDef &node : item->graph.node()) {
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ("SubGrad", item.id);
+  EXPECT_EQ(12, item.function_body().node_size());
+
+  ASSERT_EQ(3, item.input_size());
+  EXPECT_EQ("x", item.input(0).input_name);
+  EXPECT_EQ("y", item.input(1).input_name);
+  EXPECT_EQ("dz", item.input(2).input_name);
+
+  ASSERT_EQ(2, item.output_size());
+  EXPECT_EQ("dx", item.output(0).output_tensors[0]);
+  EXPECT_EQ("dy", item.output(1).output_tensors[0]);
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
     if (node.name() == "x" || node.name() == "y" || node.name() == "dz") {
+      count++;
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "rx") {
+    } else if (node.name() == "rx" && count++) {
       EXPECT_EQ("BroadcastGradientArgs", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("sx:0", node.input(0));
-      EXPECT_EQ("sy:0", node.input(1));
-    } else if (node.name() == "sum_gx") {
+      EXPECT_EQ("sx", node.input(0));
+      EXPECT_EQ("sy", node.input(1));
+    } else if (node.name() == "sum_gx" && count++) {
       EXPECT_EQ("Sum", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("gx:0", node.input(0));
-      EXPECT_EQ("rx:0", node.input(1));
-    } else if (node.name() == "sum_gy") {
+      EXPECT_EQ("gx", node.input(0));
+      EXPECT_EQ("rx", node.input(1));
+    } else if (node.name() == "sum_gy" && count++) {
       EXPECT_EQ("Sum", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("gy:0", node.input(0));
+      EXPECT_EQ("gy", node.input(0));
       EXPECT_EQ("rx:1", node.input(1));
     }
   }
+  EXPECT_EQ(6, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
-  FunctionDefLibrary library;
-  *library.add_function() = FunctionDefHelper::Define(
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+  TF_ASSERT_OK(flib.AddFunctionDef(FunctionDefHelper::Define(
       // Name
       "Swap",
       // Args
@@ -164,7 +304,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
       {"T: {float, double}"},
       // Nodes
       {{{"o0"}, "Identity", {"i1"}, {{"T", "$T"}}},
-       {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
+       {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}})));
 
   FunctionDef func = FunctionDefHelper::Create(
       // Name
@@ -189,43 +329,47 @@ TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
 
-  for (const NodeDef &node : item->graph.node()) {
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
     if (node.name() == "x" || node.name() == "y") {
+      count++;
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "a0") {
+    } else if (node.name() == "a0" && count++) {
       EXPECT_EQ("Swap", node.op());
       EXPECT_EQ(3, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("y", node.input(1));
       EXPECT_EQ("^x2", node.input(2));
-    } else if (node.name() == "a1") {
+    } else if (node.name() == "a1" && count++) {
       EXPECT_EQ("Swap", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("a0:0", node.input(0));
+      EXPECT_EQ("a0", node.input(0));
       EXPECT_EQ("a0:1", node.input(1));
-    } else if (node.name() == "x2") {
+    } else if (node.name() == "x2" && count++) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("x", node.input(1));
-    } else if (node.name() == "y2") {
+    } else if (node.name() == "y2" && count++) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(3, node.input_size());
       EXPECT_EQ("y", node.input(0));
       EXPECT_EQ("y", node.input(1));
       EXPECT_EQ("^a1", node.input(2));
-    } else if (node.name() == "o") {
+    } else if (node.name() == "o" && count++) {
       EXPECT_EQ("Add", node.op());
       EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x2:0", node.input(0));
-      EXPECT_EQ("y2:0", node.input(1));
+      EXPECT_EQ("x2", node.input(0));
+      EXPECT_EQ("y2", node.input(1));
     }
   }
+  EXPECT_EQ(7, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
@@ -245,28 +389,31 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
       {{"out", "Exp:y:0"}});
 
   std::unordered_map<string, AttrValue> func_attr;
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
-  EXPECT_EQ(1, item->fetch.size());
-  EXPECT_EQ("Exp:0", item->fetch[0]);
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
 
-  for (const NodeDef &node : item->graph.node()) {
-    if (node.name() == "in") {
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
+    if (node.name() == "in" && count++) {
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "Linear_func") {
+    } else if (node.name() == "Linear_func" && count++) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("in", node.input(0));
-    } else if (node.name() == "Exp") {
+    } else if (node.name() == "Exp" && count++) {
       EXPECT_EQ("Exp", node.op());
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("Linear_func:0", node.input(0));
+      EXPECT_EQ("Linear_func", node.input(0));
     }
   }
+  EXPECT_EQ(3, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
@@ -285,20 +432,25 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
       {{"out0", "in0"}});
 
   std::unordered_map<string, AttrValue> func_attr;
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  EXPECT_EQ("ForwardInputs", item.id);
+  EXPECT_EQ(5, item.function_body().node_size());
 
-  EXPECT_EQ(3, item->fetch.size());
-  EXPECT_EQ("in0", item->fetch[0]);
-  EXPECT_EQ("arg2", item->fetch[1]);
-  EXPECT_EQ("arg3", item->fetch[2]);
+  EXPECT_EQ(3, item.output_size());
+  EXPECT_EQ("in0", item.output(0).output_tensors[0]);
+  EXPECT_EQ("arg2", item.output(1).output_tensors[0]);
+  EXPECT_EQ("arg3", item.output(2).output_tensors[0]);
 
-  EXPECT_EQ(5, item->graph.node_size());
-  for (const NodeDef &node : item->graph.node()) {
+  int count = 0;
+  for (const NodeDef &node : item.function_body().node()) {
     EXPECT_TRUE(node.name() == "in0" || node.name() == "in1" ||
                 node.name() == "arg2" || node.name() == "arg3" ||
                 node.name() == "arg4");
+    count++;
     EXPECT_EQ("Placeholder", node.op());
     if (node.name() == "arg3") {
       EXPECT_EQ(DT_INT32, node.attr().at("T").type());
@@ -306,6 +458,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
     }
   }
+  EXPECT_EQ(5, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
@@ -325,24 +478,121 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
-  FunctionDefLibrary library;
-  std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr, library);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
-  EXPECT_EQ(0, item->feed.size());
-  EXPECT_EQ(1, item->fetch.size());
-  EXPECT_EQ("o:0", item->fetch[0]);
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
-  EXPECT_EQ(2, item->graph.node_size());
-  const NodeDef &two = item->graph.node(0);
+  EXPECT_EQ(0, item.input_size());
+  EXPECT_EQ(1, item.output_size());
+  EXPECT_EQ("o", item.output(0).output_tensors[0]);
+
+  EXPECT_EQ(2, item.function_body().node_size());
+  const NodeDef &two = item.function_body().node(0);
   EXPECT_EQ("two", two.name());
   EXPECT_EQ(0, two.input_size());
-  const NodeDef &cast = item->graph.node(1);
+  const NodeDef &cast = item.function_body().node(1);
   EXPECT_EQ("o", cast.name());
   EXPECT_EQ(1, cast.input_size());
-  EXPECT_EQ("two:0", cast.input(0));
+  EXPECT_EQ("two", cast.input(0));
+}
+
+TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "XTimesTwo",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"scale"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}},
+      });
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  FunctionDef specialized;
+  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+
+  // Input and output types are resolved based on instantiation attributes.
+  EXPECT_EQ("x", specialized.signature().input_arg(0).name());
+  EXPECT_EQ(DT_FLOAT, specialized.signature().input_arg(0).type());
+  EXPECT_EQ("y", specialized.signature().output_arg(0).name());
+  EXPECT_EQ(DT_FLOAT, specialized.signature().output_arg(0).type());
+
+  // Function body specialized for instantiation types
+  int count = 0;
+  for (const NodeDef &node : specialized.node_def()) {
+    if (node.name() == "scale" && count++) {
+      EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
+    } else if (node.name() == "y" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("x:0", node.input(0));
+      EXPECT_EQ("scale:y:0", node.input(1));
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+    }
+  }
+  EXPECT_EQ(2, count);
+}
+
+TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) {
+  using test::function::NDef;
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  FunctionDef func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  GraphDef id_func_body = test::function::GDef(
+      {/* pass input to output through identity */
+       NDef("output", "Identity", {"x"}, {{"T", "float"}})});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+
+  FunctionDefLibrary lib_def;
+  *lib_def.add_function() = func;
+  *lib_def.add_function() = mul_func;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), lib_def);
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  // Replace function body with identity function
+  item.SwapFunctionBody(std::move(id_func_body));
+  FunctionDef specialized;
+  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+
+  // Check that graph body was updated.
+  int count = 0;
+  for (const NodeDef &node : specialized.node_def()) {
+    if (node.name() == "output" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ("x:0", node.input(0));
+    }
+  }
+  EXPECT_EQ(1, count);
 
-  std::cout << item->graph.DebugString() << std::endl;
+  // And return tensor mapping was updated with a new output name (z->output).
+  EXPECT_EQ("output:output:0", (*specialized.mutable_ret())["z"]);
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 8d8ff4da3a8df5a2868f1a3a0ac6a5d0c2fd66ad..a8e464d09d6c2e6730bea6c92d769c378cb8ac86 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -26,24 +26,24 @@ namespace grappler {
 
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
-Status TopologicalSort(GraphDef* graph) {
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<int>* ready_nodes) {
   SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*graph));
+  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
 
-  std::vector<int> ready_nodes;
-  ready_nodes.reserve(graph_view.num_nodes());
+  ready_nodes->reserve(graph_view.num_nodes());
 
   int front = 0;
   int back = 0;
   std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
   for (int i = 0; i < graph_view.num_nodes(); i++) {
     if (graph_view.inputs(i).empty()) {
-      ready_nodes.push_back(i);
+      ready_nodes->push_back(i);
       back++;
     }
-    if (IsMerge(graph->node(i))) {
+    if (IsMerge(graph.node(i))) {
       for (int input : graph_view.inputs(i)) {
-        if (IsNextIteration(graph->node(input))) {
+        if (IsNextIteration(graph.node(input))) {
           num_ready_inputs[i]++;
         }
       }
@@ -51,11 +51,11 @@ Status TopologicalSort(GraphDef* graph) {
   }
 
   while (front != back) {
-    int ready_node = ready_nodes[front];
+    int ready_node = (*ready_nodes)[front];
     for (int fanout : graph_view.outputs(ready_node)) {
       ++num_ready_inputs[fanout];
       if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
-        ready_nodes.push_back(fanout);
+        ready_nodes->push_back(fanout);
         ++back;
       }
     }
@@ -66,7 +66,24 @@ Status TopologicalSort(GraphDef* graph) {
     return errors::InvalidArgument(
         "The graph couldn't be sorted in topological order.");
   }
+  return Status::OK();
+}
 
+Status ComputeTopologicalOrder(
+    const GraphDef& graph,
+    std::unordered_map<const NodeDef*, int>* topo_order) {
+  std::vector<int> ready_nodes;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(graph, &ready_nodes));
+  topo_order->reserve(graph.node_size());
+  for (int i = 0; i < ready_nodes.size(); ++i) {
+    (*topo_order)[&graph.node(ready_nodes[i])] = i;
+  }
+  return Status::OK();
+}
+
+Status TopologicalSort(GraphDef* graph) {
+  std::vector<int> ready_nodes;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes));
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index 7700fe41e40e6d1111c9e84aabfd2a05968ef882..668c88dc751c8723c624822233af2a1b5b2fedf7 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -22,6 +22,10 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+// Compute a topological ordering for the graph nodes.
+Status ComputeTopologicalOrder(
+    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order);
+
 // Sort a graph in topological order.
 Status TopologicalSort(GraphDef* graph);
 
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index c96f15b0e8424d70e8dd1393cf254b52f69200d2..f5c95009d240f321d700c4123486fc3f2b49c3f4 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -52,8 +52,19 @@ TEST_F(TopologicalSortTest, NoLoop) {
   *graph.add_node() = CreateNode("5", {});
   *graph.add_node() = CreateNode("4", {});
 
+  std::unordered_map<const NodeDef*, int> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
+
+  const std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
+  for (const auto& topo : topo_order) {
+    const string& node_name = topo.first->name();
+    const int topo_order = topo.second;
+    std::cout << "Node " << node_name << " at order " << topo_order
+              << std::endl;
+    EXPECT_EQ(node_name, order[topo_order]);
+  }
+
   TF_EXPECT_OK(TopologicalSort(&graph));
-  std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
@@ -68,8 +79,17 @@ TEST_F(TopologicalSortTest, WithLoop) {
   *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
   *graph.add_node() = CreateNode("1", {});
 
+  std::unordered_map<const NodeDef*, int> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
+
+  const std::vector<string> order = {"1", "2", "3", "4", "5"};
+  for (const auto& topo : topo_order) {
+    const string& node_name = topo.first->name();
+    const int topo_order = topo.second;
+    EXPECT_EQ(node_name, order[topo_order]);
+  }
+
   TF_EXPECT_OK(TopologicalSort(&graph));
-  std::vector<string> order = {"1", "2", "3", "4", "5"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1857d8d65571524f79c02eedf05a73cd2adedd03..24131cb51e2eb43a4b4259a9f108ad31e772fc2a 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -131,6 +131,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "collective_ops",
+    prefix = "collective_ops",
+    deps = [
+        "//tensorflow/core:collective_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 tf_kernel_library(
     name = "concat_lib",
     srcs = [
@@ -606,6 +617,7 @@ cc_library(
         ":batch_space_ops",
         ":bcast_ops",
         ":bitcast_op",
+        ":broadcast_to_op",
         ":concat_op",
         ":constant_op",
         ":depth_space_ops",
@@ -657,6 +669,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "broadcast_to_op",
+    prefix = "broadcast_to_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "concat_op",
     prefix = "concat_op",
@@ -1395,6 +1413,7 @@ tf_kernel_library(
     visibility = [":friends"],
     deps = [
         ":bounds_check",
+        ":dense_update_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -4311,6 +4330,7 @@ tf_kernel_library(
     deps = [
         ":random_op",
         ":random_ops",
+        ":stateless_random_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -5121,6 +5141,9 @@ filegroup(
             "summary_interface.*",
             "summary_kernels.*",
             "spectrogram_convert_test_data.cc",
+            "decode_proto_op.cc",
+            "encode_proto_op.cc",
+            "rpc_op.cc",
             # Excluded due to experimental status:
             "debug_ops.*",
             "scatter_nd_op*",
@@ -5928,8 +5951,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5944,8 +5966,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5961,8 +5982,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5982,8 +6002,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5999,8 +6018,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6016,8 +6034,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6025,8 +6042,7 @@ tf_mkl_kernel_library(
     srcs = ["mkl_fused_batch_norm_op.cc"],
     deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6034,8 +6050,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_aggregate_ops",
     deps = MATH_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6043,8 +6058,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_concat_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6052,8 +6066,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_reshape_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6061,8 +6074,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_identity_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6070,8 +6082,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_lrn_op",
     deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6153,6 +6164,50 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "decode_proto_op",
+    srcs = [
+        "decode_proto_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:decode_proto_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/util/proto:decode",
+        "//tensorflow/core/util/proto:descriptors",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "encode_proto_op",
+    srcs = ["encode_proto_op.cc"],
+    deps = [
+        "//tensorflow/core:encode_proto_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/util/proto:descriptors",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "rpc_op",
+    srcs = [
+        "rpc_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:rpc_ops_op_lib",
+        "//tensorflow/core/util/rpc:call_container",
+        "//tensorflow/core/util/rpc:rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory_registry",
+        "//third_party/eigen3",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 106ceedc00721f51468639a1c9e235728db8dbae..55599de7315a4610cdbc5937e719c0bd2b4d9c34 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -100,6 +100,10 @@ message GrowingMetadata {
   // Number of layers that we have attempted to build. After pruning, these
   // layers might have been removed.
   int64 num_layers_attempted = 2;
+  // The start (inclusive) and end (exclusive) ids of the nodes in the latest
+  // layer of the latest tree.
+  int32 last_layer_node_start = 3;
+  int32 last_layer_node_end = 4;
 }
 
 // TreeEnsemble describes an ensemble of decision trees.
diff --git a/tensorflow/core/kernels/boosted_trees/resource_ops.cc b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
index f49242d8566f09d34088131b7f74ea4362a86860..563f7b8b08c8969f7fb14c9e59b9fcef166312ce 100644
--- a/tensorflow/core/kernels/boosted_trees/resource_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
@@ -99,6 +99,7 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel {
     Tensor* output_num_trees_t = nullptr;
     Tensor* output_num_finalized_trees_t = nullptr;
     Tensor* output_num_attempted_layers_t = nullptr;
+    Tensor* output_last_layer_nodes_range_t = nullptr;
 
     OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
                                                      &output_stamp_token_t));
@@ -110,11 +111,22 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(3, TensorShape(),
                                             &output_num_attempted_layers_t));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                4, {2}, &output_last_layer_nodes_range_t));
 
     output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
     output_num_trees_t->scalar<int32>()() = num_trees;
     output_num_finalized_trees_t->scalar<int32>()() = num_finalized_trees;
     output_num_attempted_layers_t->scalar<int32>()() = num_attempted_layers;
+
+    int32 range_start;
+    int32 range_end;
+    tree_ensemble_resource->GetLastLayerNodesRange(&range_start, &range_end);
+
+    output_last_layer_nodes_range_t->vec<int32>()(0) = range_start;
+    // For a completely empty ensemble, this will be 0. To make it a valid range
+    // we add this max cond.
+    output_last_layer_nodes_range_t->vec<int32>()(1) = std::max(1, range_end);
   }
 };
 
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index c82588b9507800a860e6fc7af4a51541f09cad5b..561ca3a18a7ff1e14faad4ed463209110b026401 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -93,6 +93,26 @@ class BoostedTreesEnsembleResource : public StampedResource {
         new_num_layers);
   }
 
+  void UpdateLastLayerNodesRange(const int32 node_range_start,
+                                 int32 node_range_end) const {
+    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
+        node_range_start);
+    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
+        node_range_end);
+  }
+
+  void GetLastLayerNodesRange(int32* node_range_start,
+                              int32* node_range_end) const {
+    *node_range_start =
+        tree_ensemble_->growing_metadata().last_layer_node_start();
+    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
+  }
+
+  int64 GetNumNodes(const int32 tree_id) {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->trees(tree_id).nodes_size();
+  }
+
   void UpdateGrowingMetadata() const;
 
   int32 GetNumLayersAttempted() {
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 33fdab6a860358fab05abbb361bf004174e85658..16e65cf2843b2f2a6945d506718c3bb453fdc14a 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -42,8 +42,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
     const Tensor* node_id_range_t;
     OP_REQUIRES_OK(context, context->input("node_id_range", &node_id_range_t));
     const auto node_id_range = node_id_range_t->vec<int32>();
-    int32 node_id_first = node_id_range(0);
-    int32 node_id_last = node_id_range(1);  // inclusive.
+    const int32 node_id_first = node_id_range(0);  // inclusive
+    const int32 node_id_last = node_id_range(1);   // exclusive
     // stats_summary_list
     OpInputList stats_summary_list;
     OP_REQUIRES_OK(context, context->input_list("stats_summary_list",
@@ -86,7 +86,7 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
       std::vector<int32> output_thresholds;
       std::vector<float> output_left_node_contribs;
       std::vector<float> output_right_node_contribs;
-      for (int node_id = node_id_first; node_id <= node_id_last; ++node_id) {
+      for (int node_id = node_id_first; node_id < node_id_last; ++node_id) {
         // Calculate gains.
         cum_grad.clear();
         cum_hess.clear();
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index b9ded4054aced4bcb27b0590a44e1f86f6b0a1c2..67cac14c520ac18b4c8c547a83850ccb0ce01eae 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -101,6 +101,7 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
             << current_tree << " of ensemble of " << current_tree + 1
             << " trees.";
     bool split_happened = false;
+    int32 node_id_start = ensemble_resource->GetNumNodes(current_tree);
     // Add the splits to the tree.
     for (auto& split_entry : best_splits) {
       const int32 node_id = split_entry.first;
@@ -139,11 +140,15 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
           right_contrib, &left_node_id, &right_node_id);
       split_happened = true;
     }
+    int32 node_id_end = ensemble_resource->GetNumNodes(current_tree);
     if (split_happened) {
       // Update growable tree metadata.
       ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
       // Finalize the tree if needed.
       if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+        // If the tree is finalized, next growing will start from node 0;
+        node_id_start = 0;
+        node_id_end = 1;
         ensemble_resource->SetIsFinalized(current_tree, true);
         if (pruning_mode_ == kPostPruning) {
           ensemble_resource->PostPruneTree(current_tree);
@@ -153,6 +158,9 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
           ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
         }
       }
+      // If we managed to split, update the node range. If we didn't, don't
+      // update as we will try to split the same nodes with new instances.
+      ensemble_resource->UpdateLastLayerNodesRange(node_id_start, node_id_end);
     }
   }
 
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2810925bbcd645f60af0e6025a74043cd45f21e7
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class BroadcastToOp : public OpKernel {
+ public:
+  explicit BroadcastToOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input_tensor = ctx->input(0);
+    const TensorShape& input_shape = input_tensor.shape();
+
+    const Tensor& shape_tensor = ctx->input(1);
+
+    TensorShape output_shape;
+    OP_REQUIRES_OK(ctx,
+                   ctx->op_kernel().MakeShape(shape_tensor, &output_shape));
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
+
+    const Device& d = ctx->eigen_device<Device>();
+    functor::BroadcastTo<Device, T>()(d, ctx, *output_tensor, output_shape,
+                                      input_tensor, input_shape);
+  }
+};
+
+// As MakeShape is able to handle both DT_INT32 and DT_INT64,
+// no need to have TypeConstraint for `Tidx`
+#define REGISTER_KERNEL(type)                                           \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("BroadcastTo").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      BroadcastToOp<CPUDevice, type>);
+
+TF_CALL_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+
+namespace functor {
+#define DECLARE_GPU_TEMPLATE(Type)                              \
+  template <>                                                   \
+  void BroadcastTo<GPUDevice, Type>::operator()(                \
+      const GPUDevice& d, OpKernelContext* ctx, Tensor& output, \
+      const TensorShape& output_shape, const Tensor& input,     \
+      const TensorShape& input_shape);                          \
+  extern template struct BroadcastTo<GPUDevice, Type>;
+
+TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_TEMPLATE);
+#undef DECLARE_GPU_KERNEL
+}  // namespace functor
+
+#define REGISTER_KERNEL(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("BroadcastTo")            \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("shape"),      \
+                          BroadcastToOp<GPUDevice, type>);
+
+TF_CALL_GPU_ALL_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..608e9b6ac9c161ca2fb95897da371d355f0efe9f
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -0,0 +1,220 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+#define TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct BroadcastTo {
+  void operator()(const Device &d, OpKernelContext *ctx, Tensor &output_tensor,
+                  const TensorShape &output_shape, const Tensor &input_tensor,
+                  const TensorShape &input_shape) {
+#define BROADCAST_SHAPE(broadcast, reshape, NDIMS, input_shape, output_shape) \
+  for (int i = 0; i < NDIMS; i++) {                                           \
+    OP_REQUIRES(ctx, (broadcast[i] % reshape[i] == 0),                        \
+                errors::InvalidArgument("invalid shape to broadcast from ",   \
+                                        input_shape.DebugString(), " to ",    \
+                                        output_shape.DebugString()));         \
+    broadcast[i] = broadcast[i] / reshape[i];                                 \
+  }
+
+    switch (output_shape.dims()) {
+      case 1: {
+        auto reshape = AsEigenDSizesWithPrefix<1>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<1>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 1, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 1>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 2: {
+        auto reshape = AsEigenDSizesWithPrefix<2>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<2>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 2, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 2>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 3: {
+        auto reshape = AsEigenDSizesWithPrefix<3>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<3>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 3, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 3>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 4: {
+        auto reshape = AsEigenDSizesWithPrefix<4>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<4>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 4, input_shape, output_shape);
+
+        auto output = output_tensor.tensor<T, 4>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 4: {
+            auto input = input_tensor.tensor<T, 4>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      case 5: {
+        auto reshape = AsEigenDSizesWithPrefix<5>(input_shape);
+        auto broadcast = output_shape.AsEigenDSizes<5>();
+
+        BROADCAST_SHAPE(broadcast, reshape, 5, input_shape, output_shape);
+        auto output = output_tensor.tensor<T, 5>();
+        switch (input_shape.dims()) {
+          case 0: {
+            output.device(d) = output.constant(input_tensor.scalar<T>()());
+          } break;
+          case 1: {
+            auto input = input_tensor.tensor<T, 1>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 2: {
+            auto input = input_tensor.tensor<T, 2>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 3: {
+            auto input = input_tensor.tensor<T, 3>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 4: {
+            auto input = input_tensor.tensor<T, 4>();
+            output.device(d) = input.reshape(reshape).broadcast(broadcast);
+          } break;
+          case 5: {
+            auto input = input_tensor.tensor<T, 5>();
+            output.device(d) = input.broadcast(broadcast);
+          } break;
+          default:
+            ctx->CtxFailure(errors::InvalidArgument(
+                "invalid shape to broadcast from ", input_shape.DebugString(),
+                " to ", output_shape.DebugString()));
+            break;
+        }
+      } break;
+      default:
+        ctx->CtxFailure(errors::InvalidArgument(
+            "invalid shape to broadcast from ", input_shape.DebugString(),
+            " to ", output_shape.DebugString()));
+        break;
+    }
+  }
+
+ private:
+  template <int NDIMS>
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS> AsEigenDSizesWithPrefix(
+      const TensorShape &shape) const {
+    Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
+    for (int d = 0; d < NDIMS - shape.dims(); d++) {
+      dsizes[d] = 1;
+    }
+    for (int d = NDIMS - shape.dims(); d < NDIMS; d++) {
+      dsizes[d] = shape.dim_size(d - (NDIMS - shape.dims()));
+    }
+    return dsizes;
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
diff --git a/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..64595710853da0d6fc277c25f0700ab8fda6c526
--- /dev/null
+++ b/tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define INSTANTIATE_GPU_KERNEL(Type) \
+  template class functor::BroadcastTo<GPUDevice, Type>;
+TF_CALL_GPU_ALL_TYPES(INSTANTIATE_GPU_KERNEL);
+#undef INSTANTIATE_GPU_KERNEL
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5de41bac723ce2e62258c521a34d4775426643bd
--- /dev/null
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -0,0 +1,266 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace {
+class CollectiveOpKernel : public AsyncOpKernel {
+ public:
+  explicit CollectiveOpKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {}
+
+  // A string encoding instance, frame and iter to be handed off to
+  // the implementation for use in generating RecvBuf keys.
+  string GetCollectiveKey(OpKernelContext* c) {
+    return strings::StrCat(col_params_.instance.instance_key, ":",
+                           c->frame_iter().frame_id, ":",
+                           c->frame_iter().iter_id);
+  }
+
+  // Returns false if calling invocation of ComputeAsync should return
+  // immediately.
+  bool CanProceedWithCompute(OpKernelContext* c, CollectiveExecutor* col_exec,
+                             const DoneCallback& done) {
+    if (col_params_.group.group_size >
+        col_params_.instance.device_names.size()) {
+      // This is the first invocation: Finish initializing col_params_.
+      // Call in a blockable thread because it's not guaranteed that
+      // this call cannot block.
+      c->env()->SchedClosure([this, c, done, col_exec]() {
+        col_exec->CompleteParamsAsync(c->device()->name(), &col_params_,
+                                      c->cancellation_manager(),
+                                      [this, c, done](const Status& s) {
+                                        if (s.ok()) {
+                                          ComputeAsync(c, done);
+                                        } else {
+                                          c->SetStatus(s);
+                                          done();
+                                        }
+                                      });
+      });
+      return false;
+    }
+    return true;
+  }
+
+  CollectiveParams col_params_;
+};
+
+class CollectiveReduceOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveReduceOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("subdiv_offsets",
+                      &col_params_.instance.impl_details.subdiv_offsets));
+    string merge_op_name;
+    OP_REQUIRES_OK(c, c->GetAttr("merge_op", &merge_op_name));
+    OP_REQUIRES(c, merge_op_name == "Add" || merge_op_name == "Mul",
+                errors::InvalidArgument(
+                    "merge_op must be one of {\"Add\", \"Mul\"} but got ",
+                    merge_op_name));
+    string final_op_name;
+    OP_REQUIRES_OK(c, c->GetAttr("final_op", &final_op_name));
+    OP_REQUIRES(c, final_op_name == "Id" || final_op_name == "Div",
+                errors::InvalidArgument(
+                    "final_op must be one of {\"Id\", \"Div\"} but got ",
+                    final_op_name));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+
+    const NodeDef& real_node = c->def();
+    col_params_.name = strings::StrCat(real_node.name(), ": Reduce(",
+                                       merge_op_name, ",", final_op_name, ")");
+    col_params_.group.device_type = c->device_type();
+
+    // Find the OpKernels by name, type and device type.
+    NodeDef sub_node;
+    // The merge_op takes two inputs
+    sub_node.add_input(real_node.input(0));
+    sub_node.add_input(real_node.input(0));
+    sub_node.set_device(real_node.device());
+    SetAttrValue(col_params_.instance.data_type,
+                 &(*sub_node.mutable_attr())["T"]);
+    col_params_.merge_op = BuildOpKernel(c, merge_op_name, &sub_node);
+    col_params_.final_op = BuildOpKernel(c, final_op_name, &sub_node);
+  }
+
+  std::unique_ptr<OpKernel> BuildOpKernel(OpKernelConstruction* c,
+                                          const string& name,
+                                          NodeDef* sub_node) {
+    std::unique_ptr<OpKernel> k;
+    if (name.empty() || name == "Id") return k;
+    sub_node->set_name(name);
+    sub_node->set_op(name);
+    Status status;
+    k = CreateOpKernel(c->device_type(), c->device(),
+                       c->device()->GetAllocator(AllocatorAttributes()),
+                       *sub_node, c->graph_def_version(), &status);
+    if (!status.ok()) {
+      c->CtxFailureWithWarning(errors::Internal("Failed to build OpKernel for ",
+                                                name, " : ",
+                                                status.error_message()));
+    }
+    return k;
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    // Allocate the output tensor, trying to reuse the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->forward_input_or_allocate_output(
+                             {0}, 0, c->input(0).shape(), &output),
+                         done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveReduceOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_CPU),
+                        CollectiveReduceOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_GPU),
+                        CollectiveReduceOpKernel);
+
+class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveBcastSendOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    col_params_.is_source = true;
+    col_params_.instance.impl_details.subdiv_offsets = {0};
+
+    col_params_.name =
+        strings::StrCat(name(), ": Broadcast(", col_params_.is_source, ")");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    OP_REQUIRES_ASYNC(
+        c, shape_.IsSameSize(c->input(0).shape()),
+        errors::Internal("Declared shape of op ", col_params_.name,
+                         " does not match shape of input"),
+        done);
+    // Allocate the output Tensor, trying to reuse the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(
+        c, c->forward_input_or_allocate_output({0}, 0, shape_, &output), done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TensorShape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastSendOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_CPU),
+                        CollectiveBcastSendOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_GPU),
+                        CollectiveBcastSendOpKernel);
+
+class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveBcastRecvOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    col_params_.is_source = false;
+    col_params_.instance.impl_details.subdiv_offsets = {0};
+
+    col_params_.name =
+        strings::StrCat(name(), ": Broadcast(", col_params_.is_source, ")");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    // No input, so must allocate output.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape_, &output), done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TensorShape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastRecvOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_CPU),
+                        CollectiveBcastRecvOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_GPU),
+                        CollectiveBcastRecvOpKernel);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index d8643c0b2fb2633f6b640b4f54dc2f8c92da654d..93e392d3032405ea848bd2f147653c9a5c7a1818 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -118,6 +118,7 @@ TF_CALL_complex128(REGISTER);
 TF_CALL_int64(REGISTER);
 TF_CALL_bfloat16(REGISTER);
 TF_CALL_bool(REGISTER);
+TF_CALL_uint8(REGISTER);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 0f7adaf24a8eff76c27109eb91389dffdca31380..a561d918bd36f711d1b813dfb533ec6d690af8ee 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -202,6 +202,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
 TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
 TF_CALL_int64(REGISTER_GPUCONCAT32);
+TF_CALL_uint8(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 REGISTER_GPUCONCAT32(bool);
 
@@ -209,6 +210,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
 TF_CALL_int64(REGISTER_GPUCONCAT64);
+TF_CALL_uint8(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 REGISTER_GPUCONCAT64(bool);
 
@@ -216,6 +218,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
 TF_CALL_int64(REGISTER_GPU32);
+TF_CALL_uint8(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 REGISTER_GPU32(bool);
 
@@ -223,6 +226,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
 TF_CALL_int64(REGISTER_GPU64);
+TF_CALL_uint8(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 REGISTER_GPU64(bool);
 
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index f16766315f2640ab7c42c077fc5156a3a825fbf9..a87b63f913c279d35f625b096bb7ac947cb9230b 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -212,6 +212,7 @@ REGISTER_CONCAT(qint32);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
+TF_CALL_uint8(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
index e3ba8ae9f691c8ec9be79952d7f97801552b2a56..39b44b2fcc8eb8336bfcf721919201536ed56133 100644
--- a/tensorflow/core/kernels/concat_op_test.cc
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -78,6 +78,9 @@ static void BM_ConcatDim1Float(int iters, int dim2) {
 BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
 
+static void BM_ConcatDim1uint8(int iters, int dim2) {
+  ConcatHelper<uint8>(iters, 1, dim2);
+}
 static void BM_ConcatDim1int16(int iters, int dim2) {
   ConcatHelper<int16>(iters, 1, dim2);
 }
@@ -85,6 +88,7 @@ static void BM_ConcatDim1bfloat16(int iters, int dim2) {
   ConcatHelper<bfloat16>(iters, 1, dim2);
 }
 
+BENCHMARK(BM_ConcatDim1uint8)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
 
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 312c1a41d36245ae3ca5a09d2e76a430bc464953..fe1a1ba5a306422d410a7b4646078b7b5e4c31eb 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -258,13 +258,15 @@ REGISTER_KERNEL(GPU, Eigen::half);
 REGISTER_KERNEL(GPU, bfloat16);
 REGISTER_KERNEL(GPU, float);
 REGISTER_KERNEL(GPU, double);
+REGISTER_KERNEL(GPU, complex64);
+REGISTER_KERNEL(GPU, complex128);
 REGISTER_KERNEL(GPU, uint8);
 REGISTER_KERNEL(GPU, int8);
 REGISTER_KERNEL(GPU, uint16);
 REGISTER_KERNEL(GPU, int16);
 REGISTER_KERNEL(GPU, int64);
 REGISTER_KERNEL(GPU, bool);
-// Currently we do not support filling strings and complex64 on GPU
+// Currently we do not support filling strings on GPU
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 96bdb6a241b1d88c7b14f22fc618ea9c95fb7642..8cadeac68d7907443d860e67b26bdedaf3634e5e 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 
@@ -213,20 +214,29 @@ class CTCGreedyDecoderOp : public OpKernel {
 
     // Perform best path decoding
     std::vector<std::vector<std::vector<int> > > sequences(batch_size);
-    for (int b = 0; b < batch_size; ++b) {
-      sequences[b].resize(1);
-      auto& sequence = sequences[b][0];
-      int prev_indices = -1;
-      for (int t = 0; t < seq_len_t(b); ++t) {
-        int max_class_indices;
-        log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
-        if (max_class_indices != blank_index &&
-            !(merge_repeated_ && max_class_indices == prev_indices)) {
-          sequence.push_back(max_class_indices);
+    auto decode = [&](const int64 begin, const int64 end) {
+      for (int b = begin; b < end; ++b) {
+        sequences[b].resize(1);
+        auto &sequence = sequences[b][0];
+        int prev_indices = -1;
+        for (int t = 0; t < seq_len_t(b); ++t) {
+          int max_class_indices;
+          log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
+          if (max_class_indices != blank_index &&
+              !(merge_repeated_ && max_class_indices == prev_indices)) {
+            sequence.push_back(max_class_indices);
+          }
+          prev_indices = max_class_indices;
         }
-        prev_indices = max_class_indices;
       }
-    }
+    };
+
+    const int64 kCostPerUnit = 50 * max_time * num_classes;
+    const int64 total = batch_size;
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *ctx->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, total,
+          kCostPerUnit, decode);
 
     OP_REQUIRES_OK(
         ctx, decode_helper_.StoreAllDecodedSequences(
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index 07dc786d9b3aa9c0fbf9024a2edf844902c60b33..e4036ddaa9b25c0cc462f928e1e56e6dc12bfb8d 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -227,22 +227,43 @@ inline perftools::gputools::port::Status ToExecutorStatus(const Status& s) {
                       s.error_message());
 }
 
-// A helper to allocate temporary scratch memory for Cudnn RNN models. It takes
-// the ownership of the underlying memory. The expectation is that the memory
-// should be alive for the span of the Cudnn RNN itself.
-class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
+template <typename>
+struct ToTFDataType;
+
+template <>
+struct ToTFDataType<Eigen::half> : std::integral_constant<DataType, DT_HALF> {};
+
+template <>
+struct ToTFDataType<float> : std::integral_constant<DataType, DT_FLOAT> {};
+
+template <>
+struct ToTFDataType<double> : std::integral_constant<DataType, DT_DOUBLE> {};
+
+template <>
+struct ToTFDataType<uint8> : std::integral_constant<DataType, DT_UINT8> {};
+
+// A helper to allocate temporary scratch memory for Cudnn RNN models. It
+// takes the ownership of the underlying memory. The expectation is that the
+// memory should be alive for the span of the Cudnn RNN itself.
+template <typename T>
+class CudnnRnnAllocatorInTemp : public ScratchAllocator {
  public:
-  ~CudnnRNNWorkspaceAllocator() override {}
-  explicit CudnnRNNWorkspaceAllocator(OpKernelContext* context)
+  ~CudnnRnnAllocatorInTemp() = default;
+
+  explicit CudnnRnnAllocatorInTemp(OpKernelContext* context)
       : context_(context) {}
   int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
     return std::numeric_limits<int64>::max();
   }
+
   StatusOr<DeviceMemory<uint8>> AllocateBytes(
       perftools::gputools::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
+    const DataType tf_data_type = ToTFDataType<T>::value;
+    int64 allocate_count =
+        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
     Status allocation_status(context_->allocate_temp(
-        DT_UINT8, TensorShape({byte_size}), &temporary_memory));
+        tf_data_type, TensorShape({allocate_count}), &temporary_memory));
     if (!allocation_status.ok()) {
       return ToExecutorStatus(allocation_status);
     }
@@ -250,10 +271,16 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return StatusOr<DeviceMemory<uint8>>(
-        AsDeviceMemory<uint8>(&temporary_memory));
+    return DeviceMemory<uint8>::MakeFromByteSize(
+        temporary_memory.template flat<T>().data(),
+        temporary_memory.template flat<T>().size() * sizeof(T));
+  }
+
+  int64 TotalByteSize() const { return total_byte_size_; }
+
+  Tensor get_allocated_tensor(int index) const {
+    return allocated_tensors_[index];
   }
-  int64 TotalByteSize() { return total_byte_size_; }
 
  private:
   int64 total_byte_size_ = 0;
@@ -261,15 +288,15 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
   std::vector<Tensor> allocated_tensors_;
 };
 
-// A helper to allocate reserve-space memory for Cudnn RNN models. The tensors
-// are allocated as a kernel output, and will be fed into the backward pass.
+// A helper to allocate memory for Cudnn RNN models as a kernel output. It is
+// used by forward pass kernel to feed the output to the backward pass.
 // The memory is expected to live long enough after the backward pass is
 // finished.
 template <typename T>
-class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
+class CudnnRnnAllocatorInOutput : public ScratchAllocator {
  public:
-  ~CudnnRNNReserveSpaceAllocator() override {}
-  CudnnRNNReserveSpaceAllocator(OpKernelContext* context, int output_index)
+  ~CudnnRnnAllocatorInOutput() override {}
+  CudnnRnnAllocatorInOutput(OpKernelContext* context, int output_index)
       : context_(context), output_index_(output_index) {}
   int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
     return std::numeric_limits<int64>::max();
@@ -343,13 +370,14 @@ struct CudnnModelTypes {
   TFRNNInputMode rnn_input_mode;
   RnnDirectionMode rnn_direction_mode;
   bool HasInputC() const {
-    // For Cudnn 5.0, only LSTM has input-c. All other models use only input-h.
+    // For Cudnn 5.0, only LSTM has input-c. All other models use only
+    // input-h.
     return rnn_mode == RnnMode::kRnnLstm;
   }
 };
 
 // A helper class that collects the shapes to describe a RNN model.
-struct CudnnModelShapes {
+struct CudnnRnnModelShapes {
   int num_layers;
   int input_size;
   int num_units;
@@ -360,7 +388,7 @@ struct CudnnModelShapes {
   TensorShape output_shape;
   TensorShape hidden_state_shape;
   // At present only fields related to cached RnnDescriptor are concerned.
-  bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
+  bool IsCompatibleWith(const CudnnRnnModelShapes& rhs) const {
     return num_layers == rhs.num_layers && input_size == rhs.input_size &&
            num_units == rhs.num_units && dir_count == rhs.dir_count;
   }
@@ -371,9 +399,9 @@ struct CudnnModelShapes {
   }
 };
 
-// Utility class for using CudnnModelShapes as a hash table key.
-struct CudnnModelShapesHasher {
-  uint64 operator()(const CudnnModelShapes& to_hash) const {
+// Utility class for using CudnnRnnModelShapes as a hash table key.
+struct CudnnRnnModelShapesHasher {
+  uint64 operator()(const CudnnRnnModelShapes& to_hash) const {
     uint64 hash = static_cast<uint64>(to_hash.num_layers);
     hash = tensorflow::FingerprintCat64(
         hash, static_cast<uint64>(to_hash.input_size));
@@ -384,21 +412,21 @@ struct CudnnModelShapesHasher {
   }
 };
 
-// Utility class for using CudnnModelShapes as a hash table key.
-struct CudnnModelShapesComparator {
-  bool operator()(const CudnnModelShapes& first,
-                  const CudnnModelShapes& second) const {
+// Utility class for using CudnnRnnModelShapes as a hash table key.
+struct CudnnRnnModelShapesComparator {
+  bool operator()(const CudnnRnnModelShapes& first,
+                  const CudnnRnnModelShapes& second) const {
     return first.IsCompatibleWith(second);
   }
 };
 
-// Extract and checks the forward input tensors, parameters, and shapes from the
-// OpKernelContext.
+// Extract and checks the forward input tensors, parameters, and shapes from
+// the OpKernelContext.
 Status ExtractForwardInput(OpKernelContext* context,
                            const CudnnModelTypes& model_types,
                            const Tensor** input, const Tensor** input_h,
                            const Tensor** input_c, const Tensor** params,
-                           CudnnModelShapes* model_shapes) {
+                           CudnnRnnModelShapes* model_shapes) {
   TF_RETURN_IF_ERROR(context->input("input", input));
   TF_RETURN_IF_ERROR(context->input("input_h", input_h));
   if (model_types.HasInputC()) {
@@ -810,7 +838,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
     const Tensor* params = nullptr;
-    CudnnModelShapes model_shapes;
+    CudnnRnnModelShapes model_shapes;
     OP_REQUIRES_OK(context,
                    ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
@@ -876,7 +904,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the reserve_space. The memory lives in the
     // output of this kernel. And it will be fed into the backward pass when
     // needed.
-    CudnnRNNReserveSpaceAllocator<T> reserve_space_allocator(context, 3);
+    CudnnRnnAllocatorInOutput<T> reserve_space_allocator(context, 3);
     if (!is_training_) {
       Tensor* dummy_reserve_space = nullptr;
       OP_REQUIRES_OK(context,
@@ -884,7 +912,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     }
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
-    CudnnRNNWorkspaceAllocator workspace_allocator(context);
+    CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
     bool launch_status = false;
     {
       mutex_lock l(mu_);
@@ -910,7 +938,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
                   input_c_data, params_data, *output_desc, &output_data,
                   *hidden_state_desc, &output_h_data, *hidden_state_desc,
                   &output_c_data, is_training_, &reserve_space_allocator,
-                  &workspace_allocator, /* output_result_profile */ nullptr)
+                  &workspace_allocator, /*output_result_profile=*/nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -920,8 +948,8 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
  private:
   mutex mu_;
   bool is_training_;
-  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
-                     CudnnModelShapesComparator>
+  std::unordered_map<CudnnRnnModelShapes, RnnScratchSpace,
+                     CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>
       rnn_state_cache_ GUARDED_BY(mu_);
 };
 
@@ -949,7 +977,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
     const Tensor* params = nullptr;
-    CudnnModelShapes model_shapes;
+    CudnnRnnModelShapes model_shapes;
     OP_REQUIRES_OK(context,
                    ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
@@ -1090,7 +1118,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     auto reserve_space_uint8 = CastDeviceMemory<uint8, T>(reserve_space);
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
-    CudnnRNNWorkspaceAllocator workspace_allocator(context);
+    CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
     bool launch_status = false;
     {
       mutex_lock l(mu_);
@@ -1119,7 +1147,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
                   output_c_backprop_data, &input_backprop_data,
                   &input_h_backprop_data, &input_c_backprop_data,
                   &params_backprop_data, &reserve_space_uint8,
-                  &workspace_allocator, /* output_result_profile */ nullptr)
+                  &workspace_allocator, /*output_result_profile=*/nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -1128,8 +1156,8 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
 
  private:
   mutex mu_;
-  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
-                     CudnnModelShapesComparator>
+  std::unordered_map<CudnnRnnModelShapes, RnnScratchSpace,
+                     CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>
       rnn_state_cache_ GUARDED_BY(mu_);
 };
 
diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd22f5777c27bba4a1694443206855380b226921
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -0,0 +1,224 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// Basic coefficient-wise tenary operations.
+// This is the case for example of the clip_by_value.
+//   Device: E.g., CPUDevice, GPUDevice.
+//   Functor: defined above. E.g., functor::clip.
+template <typename Device, typename T>
+class ClipOp : public OpKernel {
+ public:
+  explicit ClipOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& in0 = ctx->input(0);
+    const Tensor& in1 = ctx->input(1);
+    const Tensor& in2 = ctx->input(2);
+
+    auto in0_flat = in0.flat<T>();
+    auto in1_flat = in1.flat<T>();
+    auto in2_flat = in2.flat<T>();
+    const Device& d = ctx->eigen_device<Device>();
+
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
+    auto out_flat = out->flat<T>();
+    if (in1.shape() == in2.shape()) {
+      if (in0.shape() == in1.shape()) {
+        functor::TernaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                            out_flat);
+      } else {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::UnaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                          out_flat);
+      }
+    } else {
+      if (in0.shape() == in1.shape()) {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::BinaryLeftClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                               out_flat);
+      } else {
+        OP_REQUIRES(ctx, (in0.shape() == in2.shape() &&
+                          TensorShapeUtils::IsScalar(in1.shape())),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::BinaryRightClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                                out_flat);
+      }
+    }
+  }
+};
+
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipFunc {
+  UnaryClipFunc(const T& value_min, const T& value_max)
+      : value_min_(value_min), value_max_(value_max) {}
+  const T operator()(const T& value) const {
+    return std::max(std::min(value, value_max_), value_min_);
+  }
+  T value_min_;
+  T value_max_;
+};
+template <typename T>
+struct UnaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat = in0_flat.unaryExpr(UnaryClipFunc<T>(in1_flat(0), in2_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipFunc {
+  BinaryRightClipFunc(const T& value_min) : value_min_(value_min) {}
+  const T operator()(const T& value, const T& value_max) const {
+    return std::max(std::min(value, value_max), value_min_);
+  }
+  T value_min_;
+};
+template <typename T>
+struct BinaryRightClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in2_flat, BinaryRightClipFunc<T>(in1_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipFunc {
+  BinaryLeftClipFunc(const T& value_max) : value_max_(value_max) {}
+  const T operator()(const T& value, const T& value_min) const {
+    return std::max(std::min(value, value_max_), value_min);
+  }
+  T value_max_;
+};
+template <typename T>
+struct BinaryLeftClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in1_flat, BinaryLeftClipFunc<T>(in2_flat(0)));
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_CPU(T)                         \
+  template struct UnaryClipOp<CPUDevice, T>;       \
+  template struct BinaryRightClipOp<CPUDevice, T>; \
+  template struct BinaryLeftClipOp<CPUDevice, T>;  \
+  template struct TernaryClipOp<CPUDevice, T>;
+INSTANTIATE_CPU(Eigen::half);
+INSTANTIATE_CPU(float);
+INSTANTIATE_CPU(double);
+INSTANTIATE_CPU(int8);
+INSTANTIATE_CPU(int16);
+INSTANTIATE_CPU(int32);
+INSTANTIATE_CPU(int64);
+INSTANTIATE_CPU(uint8);
+INSTANTIATE_CPU(uint16);
+#undef INSTANTIATE_CPU
+}  // namespace functor
+
+#define REGISTER_CPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      ClipOp<CPUDevice, type>);
+
+REGISTER_CPU_KERNEL(Eigen::half);
+REGISTER_CPU_KERNEL(float);
+REGISTER_CPU_KERNEL(double);
+REGISTER_CPU_KERNEL(int8);
+REGISTER_CPU_KERNEL(int16);
+REGISTER_CPU_KERNEL(int32);
+REGISTER_CPU_KERNEL(int64);
+REGISTER_CPU_KERNEL(uint8);
+REGISTER_CPU_KERNEL(uint16);
+#undef REGISTER_CPU_KERNEL
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      ClipOp<GPUDevice, type>);
+REGISTER_GPU_KERNEL(Eigen::half);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(int16);
+REGISTER_GPU_KERNEL(int64);
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(uint16);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("ClipByValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("t")
+                            .HostMemory("clip_value_min")
+                            .HostMemory("clip_value_max")
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("T"),
+                        ClipOp<CPUDevice, int32>);
+
+#undef REGISTER_GPU_KERNEL
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_clip.h b/tensorflow/core/kernels/cwise_op_clip.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a4bf8cf1d63736cce079fc7823c16d585007ca0
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.h
@@ -0,0 +1,61 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
+#define TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename Device, typename T>
+struct UnaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename Device, typename T>
+struct BinaryRightClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename Device, typename T>
+struct BinaryLeftClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename Device, typename T>
+struct TernaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+}
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c0784754817bf4534b3137b9a1a39f02c287ec4
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
@@ -0,0 +1,134 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+template <typename T>
+__global__ void UnaryClipCustomKernel(const int32 size_in, const T *in0,
+                                      const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryRightClipCustomKernel(const int32 size_in, const T *in0,
+                                            const T *in1, const T *in2,
+                                            T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[i] < in0[i] ? in2[i] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryLeftClipCustomKernel(const int32 size_in, const T *in0,
+                                           const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[i] ? in1[i] : value;
+  }
+}
+
+namespace functor {
+
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    UnaryClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryRightClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryLeftClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_GPU(T)                         \
+  template struct UnaryClipOp<GPUDevice, T>;       \
+  template struct BinaryRightClipOp<GPUDevice, T>; \
+  template struct BinaryLeftClipOp<GPUDevice, T>;  \
+  template struct TernaryClipOp<GPUDevice, T>;
+INSTANTIATE_GPU(Eigen::half);
+INSTANTIATE_GPU(float);
+INSTANTIATE_GPU(double);
+INSTANTIATE_GPU(int8);
+INSTANTIATE_GPU(int16);
+INSTANTIATE_GPU(int32);
+INSTANTIATE_GPU(int64);
+INSTANTIATE_GPU(uint8);
+INSTANTIATE_GPU(uint16);
+#undef INSTANTIATE_GPU
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index 0c42f632521dd86760e791626c8978c0b1e82709..3eed847c16229f20df7495e0f17b4e5e35a64a8f 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -34,6 +34,19 @@ class DecodeCSVOp : public OpKernel {
                 errors::InvalidArgument("Out type too large"));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("field_delim", &delim));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_quote_delim", &use_quote_delim_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("select_cols", &select_cols_));
+    OP_REQUIRES(
+        ctx, out_type_.size() == select_cols_.size() || select_cols_.empty(),
+        errors::InvalidArgument("select_cols should match output size"));
+    select_all_cols_ = select_cols_.empty();
+    for (int i = 1; i < select_cols_.size(); i++) {
+      OP_REQUIRES(ctx, select_cols_[i - 1] < select_cols_[i],
+                  errors::InvalidArgument(
+                      "select_cols should be strictly increasing indices"));
+    }
+    OP_REQUIRES(
+        ctx, select_cols_.empty() || select_cols_.front() >= 0,
+        errors::InvalidArgument("select_cols should be non-negative indices"));
     OP_REQUIRES(ctx, delim.size() == 1,
                 errors::InvalidArgument("field_delim should be only 1 char"));
     delim_ = delim[0];
@@ -183,13 +196,18 @@ class DecodeCSVOp : public OpKernel {
 
  private:
   std::vector<DataType> out_type_;
+  std::vector<int64> select_cols_;
   char delim_;
   bool use_quote_delim_;
+  bool select_all_cols_;
   string na_value_;
 
   void ExtractFields(OpKernelContext* ctx, StringPiece input,
                      std::vector<string>* result) {
     int64 current_idx = 0;
+    int64 num_fields_parsed = 0;
+    int64 selector_idx = 0;  // Keep track of index into select_cols
+
     if (!input.empty()) {
       while (static_cast<size_t>(current_idx) < input.size()) {
         if (input[current_idx] == '\n' || input[current_idx] == '\r') {
@@ -198,6 +216,10 @@ class DecodeCSVOp : public OpKernel {
         }
 
         bool quoted = false;
+        bool include =
+            (select_all_cols_ || select_cols_[selector_idx] ==
+                                     static_cast<size_t>(num_fields_parsed));
+
         if (use_quote_delim_ && input[current_idx] == '"') {
           quoted = true;
           current_idx++;
@@ -214,7 +236,7 @@ class DecodeCSVOp : public OpKernel {
                             input[current_idx] != '\r',
                         errors::InvalidArgument(
                             "Unquoted fields cannot have quotes/CRLFs inside"));
-            field += input[current_idx];
+            if (include) field += input[current_idx];
             current_idx++;
           }
 
@@ -226,14 +248,14 @@ class DecodeCSVOp : public OpKernel {
               (static_cast<size_t>(current_idx) < input.size() - 1) &&
               (input[current_idx] != '"' || input[current_idx + 1] != delim_)) {
             if (input[current_idx] != '"') {
-              field += input[current_idx];
+              if (include) field += input[current_idx];
               current_idx++;
             } else {
               OP_REQUIRES(
                   ctx, input[current_idx + 1] == '"',
                   errors::InvalidArgument("Quote inside a string has to be "
                                           "escaped by another quote"));
-              field += '"';
+              if (include) field += '"';
               current_idx += 2;
             }
           }
@@ -250,11 +272,20 @@ class DecodeCSVOp : public OpKernel {
           current_idx += 2;
         }
 
-        result->push_back(field);
+        num_fields_parsed++;
+        if (include) {
+          result->push_back(field);
+          selector_idx++;
+          if (selector_idx == select_cols_.size()) return;
+        }
       }
 
+      bool include =
+          (select_all_cols_ || select_cols_[selector_idx] ==
+                                   static_cast<size_t>(num_fields_parsed));
       // Check if the last field is missing
-      if (input[input.size() - 1] == delim_) result->push_back(string());
+      if (include && input[input.size() - 1] == delim_)
+        result->push_back(string());
     }
   }
 };
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4e5b776ed656a16cabde5e6823cf5c21efcaa59
--- /dev/null
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -0,0 +1,1011 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// DecodeProto is a TensorFlow Op which extracts arbitrary fields
+// from protos serialized as strings.
+//
+// See docs in ../ops/decode_proto_op.cc.
+//
+// This implementation reads the serialized format using a handful of
+// calls from the WireFormatLite API used by generated proto code.
+// WireFormatLite is marked as an "internal" proto API but is widely
+// used in practice and highly unlikely to change.
+// This will be much faster than the previous implementation based on
+// constructing a temporary dynamic message in memory and using the
+// proto reflection api to read it.
+// It can be used with any proto whose descriptors are available at
+// runtime but should be competitive in speed with approaches that
+// compile in the proto definitions.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/decode.h"
+#include "tensorflow/core/util/proto/descriptors.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::MakeUnique;
+using ::tensorflow::protobuf::Descriptor;
+using ::tensorflow::protobuf::DescriptorPool;
+using ::tensorflow::protobuf::DynamicMessageFactory;
+using ::tensorflow::protobuf::FieldDescriptor;
+using ::tensorflow::protobuf::Message;
+using ::tensorflow::protobuf::TextFormat;
+using ::tensorflow::protobuf::internal::WireFormatLite;
+using ::tensorflow::protobuf::io::CodedInputStream;
+
+const bool kFailOnDecodeError = true;
+
+// Returns true if the proto field type can be converted to the
+// tensorflow::DataType.
+bool CheckOutputType(FieldDescriptor::Type field_type, DataType output_type) {
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return output_type == tensorflow::DT_DOUBLE;
+    case WireFormatLite::TYPE_FLOAT:
+      return output_type == tensorflow::DT_FLOAT ||
+             output_type == tensorflow::DT_DOUBLE;
+    case WireFormatLite::TYPE_INT64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_UINT64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_INT32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_FIXED64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_FIXED32:
+      return output_type == tensorflow::DT_INT32 ||
+             output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_BOOL:
+      return output_type == tensorflow::DT_BOOL;
+    case WireFormatLite::TYPE_STRING:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_GROUP:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_MESSAGE:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_BYTES:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_UINT32:
+      return output_type == tensorflow::DT_INT32 ||
+             output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_ENUM:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SFIXED32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SFIXED64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_SINT32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SINT64:
+      return output_type == tensorflow::DT_INT64;
+      // default: intentionally omitted in order to enable static checking.
+  }
+}
+
+// A FieldInfo holds a handful of information from the FieldDescriptor
+// and user attributes.
+struct FieldInfo {
+  FieldInfo(const FieldDescriptor* field_desc, int user_index)
+      : output_index(user_index) {
+    // Without this intermediate data structure, the profile had hotspots
+    // calling methods of FieldDescriptor.
+    number = field_desc->number();
+
+    // The wire format library defines the same constants used in
+    // descriptor.proto. This static_cast is safe because they
+    // are guaranteed to stay in sync.
+    // We need the field type from the FieldDescriptor here
+    // because the wire format doesn't tell us anything about
+    // what happens inside a packed repeated field: there is
+    // enough information in the wire format to skip the
+    // whole field but not enough to know how to parse what's
+    // inside. For that we go to the schema.
+    type = static_cast<WireFormatLite::FieldType>(field_desc->type());
+    is_repeated = field_desc->is_repeated();
+  }
+
+  // Disable copy and move.
+  FieldInfo(const FieldInfo&) = delete;
+  FieldInfo& operator=(const FieldInfo&) = delete;
+
+  // Internally we sort field descriptors by wire number for
+  // fast lookup. In general this is different from the order
+  // given by the user. Output_index gives the index into
+  // the field_names and output_types attributes and into
+  // the output tensor list.
+  int output_index = -1;
+
+  // This is a cache of the relevant fields from `FieldDescriptorProto`.
+  // This was added after noticing that FieldDescriptor->type() was
+  // using 6% of the cpu profile.
+  WireFormatLite::FieldType type;
+  int number;
+  bool is_repeated;
+};
+
+// A CountCollector counts sizes of repeated and optional fields in a proto.
+//
+// Each field is tracked by a single CountCollector instance. The
+// instance manages a single count, which is stored as a pointer (it
+// is intended to be a reference to the `sizes` output which is being
+// filled in). The pointer is passed in at initialization.
+//
+// Counting is done as a separate pass in order to allocate output tensors
+// all at once. This allows the TensorFlow runtime to optimize allocation
+// for the consumer, while removing the need for copying inside this op.
+// After this pass, the DenseCollector class (below) gathers the data:
+// It is more complex and provides better motivation for the API here.
+class CountCollector {
+ public:
+  // Default constructor allows the collector to be a vector element.
+  CountCollector() = default;
+
+  // The count may be stored inside an Eigen Tensor to eliminate copying.
+  explicit CountCollector(int32* count) : count_ptr_(count) {}
+
+  // Reads (in this case counts) a single value.
+  Status ReadValue(CodedInputStream* input, const FieldInfo& field) {
+    // Only repeated fields can have count > 1.
+    if (*count_ptr_ == 0 || field.is_repeated) {
+      (*count_ptr_)++;
+    }
+    // We expect a wire type based on the schema field_type, to allow
+    // a little more checking.
+    if (!SkipValue(input, field)) {
+      return errors::DataLoss("ReadValue: Failed skipping field when counting");
+    }
+    return Status::OK();
+  }
+
+  // Reads (in this case counts) a length-delimited list of values.
+  Status ReadPackedValues(CodedInputStream* input, const FieldInfo& field,
+                          size_t buf_size) {
+    if (buf_size == 0) {
+      return Status::OK();
+    }
+
+    const void* tmpbuf;
+    int unused_max_buf_size;
+
+    input->GetDirectBufferPointerInline(&tmpbuf, &unused_max_buf_size);
+    // This is safe because the underlying storage for the CodedInputStream is
+    // owned by the input tensor. If it were a Cord or file-backed stream this
+    // pointer would go stale after the bytes were skipped.
+    const uint8* buf = reinterpret_cast<const uint8*>(tmpbuf);
+
+    // Important: we skipped the input->{Push,Pop}Limit() calls for speed,
+    // so the bounds check on buf_size inside Skip() is critical, and
+    // must be done before scanning the contents.
+    if (!input->Skip(buf_size)) {
+      return errors::DataLoss("ReadPackedValues: Skipping packed field failed");
+    }
+
+    // Dispatch to the appropriately typed field reader based on the
+    // schema type.
+    Status st;
+    switch (field.type) {
+      case WireFormatLite::TYPE_DOUBLE:
+        st = CountPackedFixed<double>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FLOAT:
+        st = CountPackedFixed<float>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_INT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_UINT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_INT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FIXED64:
+        st = CountPackedFixed<uint64>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FIXED32:
+        st = CountPackedFixed<uint32>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_BOOL:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_STRING:
+        st = errors::DataLoss("TYPE_STRING encountered as packed");
+        break;
+      case WireFormatLite::TYPE_GROUP:
+        st = errors::DataLoss("TYPE_GROUP encountered as packed");
+        break;
+      case WireFormatLite::TYPE_MESSAGE:
+        st = errors::DataLoss("TYPE_MESSAGE encountered as packed");
+        break;
+      case WireFormatLite::TYPE_BYTES:
+        st = errors::DataLoss("TYPE_BYTES encountered as packed");
+        break;
+      case WireFormatLite::TYPE_UINT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_ENUM:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SFIXED32:
+        st = CountPackedFixed<int32>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SFIXED64:
+        st = CountPackedFixed<int64>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SINT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SINT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+        // default: intentionally omitted in order to enable static checking.
+    }
+    if (!st.ok()) {
+      return st;
+    }
+
+    if (!field.is_repeated && *count_ptr_ > 1) {
+      *count_ptr_ = 1;
+    }
+    return Status::OK();
+  }
+
+ private:
+  // Skips a length-delimited value.
+  static bool SkipBytes(CodedInputStream* input) {
+    uint32 length;
+    if (!input->ReadVarint32(&length)) {
+      return false;
+    }
+    return input->Skip(length);
+  }
+
+  // Counts the number of packed varints in an array.
+  // The end of a varint is signaled by a value < 0x80,
+  // so counting them requires parsing the bytestream.
+  // It is the caller's responsibility to ensure that len > 0.
+  Status CountPackedVarint(const uint8* buf, size_t len) {
+    const uint8* bound = buf + len;
+    int count;
+
+    // The last byte in a valid encoded varint is guaranteed to have
+    // the high bit unset. We rely on this property to prevent
+    // ReadVarint64FromArray from going out of bounds, so validate
+    // the end of the buf before scanning anything.
+    if (bound[-1] & 0x80) {
+      return errors::DataLoss("Corrupt packed varint");
+    }
+
+    // Now we can trust ReadVarint64FromArray to stay in bounds.
+    for (count = 0; buf < bound; ++count) {
+      uint64 temp;
+      bool ok;
+      buf = internal::ReadVarint64FromArray(buf, &ok, &temp);
+      if (!ok) {
+        return errors::DataLoss("Corrupt packed varint");
+      }
+    }
+
+    *count_ptr_ += count;
+    return Status::OK();
+  }
+
+  // Counts the number of fixed-size values in a packed field.
+  // This can be done without actually parsing anything.
+  template <typename T>
+  Status CountPackedFixed(const uint8* unused_buf, size_t len) {
+    int count = len / sizeof(T);
+    if (count * sizeof(T) != len) {
+      return errors::DataLoss(
+          "Illegal data length for packed fixed-size type: ", len);
+    }
+    *count_ptr_ += len / sizeof(T);
+    return Status::OK();
+  }
+
+  // Skips a single value in the input stream.
+  // Dispatches to the appropriately typed field skipper based on the
+  // schema type tag.
+  // This is not as permissive as just handling the wire type.
+  static bool SkipValue(CodedInputStream* input, const FieldInfo& field) {
+    uint32 tmp32;
+    protobuf_uint64 tmp64;
+    switch (field.type) {
+      case WireFormatLite::TYPE_DOUBLE:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_FLOAT:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_INT64:
+        return input->ReadVarint64(&tmp64);
+      case WireFormatLite::TYPE_UINT64:
+        return input->ReadVarint64(&tmp64);
+      case WireFormatLite::TYPE_INT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_FIXED64:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_FIXED32:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_BOOL:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_STRING:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_GROUP:
+        return WireFormatLite::SkipField(
+            input, WireFormatLite::MakeTag(
+                       field.number, WireFormatLite::WIRETYPE_START_GROUP));
+      case WireFormatLite::TYPE_MESSAGE:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_BYTES:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_UINT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_ENUM:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_SFIXED32:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_SFIXED64:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_SINT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_SINT64:
+        return input->ReadVarint64(&tmp64);
+        // default: intentionally omitted in order to enable static checking.
+    }
+  }
+
+  int32* count_ptr_ = nullptr;
+};
+
+// A DenseCollector accumulates values from a proto into a tensor.
+//
+// There is an instance of DenseCollector for each field of each
+// proto. The DenseCollector deserializes the value from the wire
+// directly into the preallocated output Tensor.
+//
+// This class is named DenseCollector because in the future there should
+// be a SparseCollector that accumulates field data into sparse tensors if
+// the user requests it.
+class DenseCollector {
+ public:
+  // Default constructor allows the collector to be a vector element.
+  DenseCollector() = default;
+
+  // A DenseCollector applies to one field of a serialized message.
+  DenseCollector(uint8* datap, DataType dtype, int max_repeat_count)
+      : datap_(datap), dtype_(dtype), max_repeat_count_(max_repeat_count) {}
+
+  // Reads a value from the input stream and stores it.
+  //
+  // Always inlining gave a ~50% speedup on microbenchmarks at one point.
+  // TODO(nix): try removing it to see if that still holds.
+  // TODO(jsimsa): ABSL_ATTRIBUTE_ALWAYS_INLINE
+  Status ReadValue(CodedInputStream* input, const FieldInfo& field) {
+    // For required and optional fields, we overwrite values[0] with
+    // the latest one in the wire stream.
+    // See https://developers.google.com/protocol-buffers/docs/encoding#optional
+    // Only for repeated fields do we advance the next_repeat_index_ past 1.
+    // TODO(nix): to handle oneof we must also zero out any previous values
+    //  seen on the wire.
+    int32 index = 0;
+    if (field.is_repeated) {
+      index = next_repeat_index_;
+    }
+    next_repeat_index_ = index + 1;
+
+    return internal::ReadValue(input, field.type, field.number, dtype_, index,
+                               datap_);
+  }
+
+  // Reads and stores a length-delimited list of values.
+  Status ReadPackedValues(CodedInputStream* input, const FieldInfo& field,
+                          const size_t buf_size) {
+    const void* buf;
+    int unused_max_buf_size;
+    input->GetDirectBufferPointerInline(&buf, &unused_max_buf_size);
+    // This is safe because the underlying storage for the CodedInputStream is
+    // owned by the input tensor. If it were a Cord or file-backed stream this
+    // pointer would go stale after the bytes were skipped.
+    if (!input->Skip(buf_size)) {
+      return errors::DataLoss(
+          "ReadPackedValues: Skipping packed field failed.  Field tag: ",
+          field.number);
+    }
+
+    // Setting stride=0 causes new values to overwrite old ones for
+    // non-repeated fields.
+    const int stride = field.is_repeated ? 1 : 0;
+
+    if (next_repeat_index_ >= max_repeat_count_) {
+      return errors::DataLoss(
+          "ReadPackedValues: Tried to write more entries than allowed.  "
+          "Field tag: ",
+          field.number, ", Max entries allowed: ", max_repeat_count_);
+    } else {
+      return internal::ReadPackedFromArray(buf, buf_size, field.type,
+                                           field.number, dtype_, stride,
+                                           &next_repeat_index_, datap_);
+    }
+  }
+
+  // Fills in any missing values in the output array with defaults.
+  // Dispatches to the appropriately typed field default based on the
+  // runtime type tag.
+  Status FillWithDefaults() {
+    switch (dtype_) {
+      case DataType::DT_FLOAT:
+        return FillDefault<float>();
+      case DataType::DT_DOUBLE:
+        return FillDefault<double>();
+      case DataType::DT_INT32:
+        return FillDefault<int32>();
+      case DataType::DT_UINT8:
+        return FillDefault<uint8>();
+      case DataType::DT_INT8:
+        return FillDefault<int8>();
+      case DataType::DT_STRING:
+        return FillDefault<string>();
+      case DataType::DT_INT64:
+        return FillDefault<int64>();
+      case DataType::DT_BOOL:
+        return FillDefault<bool>();
+      default:
+        // There are many tensorflow dtypes not handled here, but they
+        // should not come up unless type casting is added to the Op.
+        // Chaining with tf.cast() should do the right thing until then.
+        return errors::DataLoss(
+            "Failed filling defaults in unknown tf::DataType");
+    }
+  }
+
+ private:
+  // Fills empty values in the dense representation with a
+  // default value. This uses next_repeat_index_ which counts the number
+  // of parsed values for the field.
+  template <class T>
+  Status FillDefault() {
+    for (int i = next_repeat_index_; i < max_repeat_count_; i++) {
+      reinterpret_cast<T*>(datap_)[i] = T();
+    }
+    return Status::OK();
+  }
+
+  int32 next_repeat_index_ = 0;
+
+  // This is a pointer to data_[message_index_].
+  // There is no bounds checking at this level: we computed the max
+  // repeat size for each field in CountCollector and use the same
+  // code to traverse it here, so we are guaranteed not to be called
+  // for more items than we have allocated space.
+  void* const datap_ = nullptr;
+
+  const DataType dtype_ = DataType::DT_INVALID;
+  const int max_repeat_count_ = 0;
+};
+
+class DecodeProtoOp : public OpKernel {
+ public:
+  explicit DecodeProtoOp(OpKernelConstruction* context) : OpKernel(context) {
+    string descriptor_source;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("descriptor_source", &descriptor_source));
+
+    // We always get back a desc_pool, but we may not own it. If we own it,
+    // owned_desc_pool_ will be filled in.
+    DescriptorPool const* desc_pool;
+    OP_REQUIRES_OK(context, GetDescriptorPool(context->env(), descriptor_source,
+                                              &desc_pool, &owned_desc_pool_));
+
+    string message_type;
+    OP_REQUIRES_OK(context, context->GetAttr("message_type", &message_type));
+
+    const Descriptor* message_desc =
+        desc_pool->FindMessageTypeByName(message_type);
+    OP_REQUIRES(context, message_desc != nullptr,
+                errors::InvalidArgument("No descriptor found for message type ",
+                                        message_type));
+
+    std::vector<string> field_names;
+    OP_REQUIRES_OK(context, context->GetAttr("field_names", &field_names));
+    std::vector<DataType> output_types;
+    OP_REQUIRES_OK(context, context->GetAttr("output_types", &output_types));
+    OP_REQUIRES(
+        context, field_names.size() == output_types.size(),
+        errors::InvalidArgument("field_names and output_types attributes must "
+                                "have the same length"));
+
+    // Gather the field descriptors and check that requested output types match.
+
+    int field_index = 0;
+    std::vector<const FieldDescriptor*> field_descs;
+    for (const string& name : field_names) {
+      auto fd = message_desc->FindFieldByName(name);
+      OP_REQUIRES(context, fd != nullptr,
+                  errors::InvalidArgument("Unknown field: ", name,
+                                          " in message type ", message_type));
+      OP_REQUIRES(context,
+                  CheckOutputType(fd->type(), output_types[field_index]),
+                  // Many TensorFlow types don't have corresponding proto types
+                  // and the user will get an error if they are requested. It
+                  // would be nice to allow conversions here, but tf.cast
+                  // already exists so we don't duplicate the functionality.
+                  // Known unhandled types:
+                  //   DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+                  //   DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+                  errors::InvalidArgument("Unexpected output type for ",
+                                          fd->full_name(), ": ", fd->cpp_type(),
+                                          " to ", output_types[field_index]));
+
+      field_index++;
+      field_descs.push_back(fd);
+    }
+
+    // Internally we want the field_descs sorted by their number on the wire.
+    // But the output tensors are allocated in the order given by the caller.
+    // Build a mapping i->j, where field_descs[i] corresponds to outputs[j].
+    std::vector<int> output_indices;
+    output_indices.reserve(field_names.size());
+    for (int i = 0; i < field_names.size(); i++) {
+      output_indices.push_back(i);
+    }
+    std::sort(output_indices.begin(), output_indices.end(),
+              [field_descs](int a, int b) {
+                return field_descs[a]->number() < field_descs[b]->number();
+              });
+
+    // Now store the fields in sorted order.
+    for (int i = 0; i < field_names.size(); i++) {
+      fields_.push_back(MakeUnique<FieldInfo>(field_descs[output_indices[i]],
+                                              output_indices[i]));
+    }
+
+    message_prototype_ = message_factory_.GetPrototype(message_desc);
+    OP_REQUIRES(context, message_prototype_ != nullptr,
+                errors::InvalidArgument("Couldn't get prototype message: ",
+                                        message_desc->full_name()));
+    string format;
+    OP_REQUIRES_OK(context, context->GetAttr("message_format", &format));
+    OP_REQUIRES(
+        context, format == "binary" || format == "text",
+        errors::InvalidArgument("format must be one of binary or text"));
+    is_binary_ = format == "binary";
+
+    // Enable the initial protobuf sanitizer, which is much
+    // more expensive than the decoder.
+    // TODO(nix): Remove this once the fast decoder
+    // has passed security review.
+    OP_REQUIRES_OK(context, context->GetAttr("sanitize", &sanitize_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& buf_tensor = ctx->input(0);
+    int message_count = buf_tensor.NumElements();
+    OP_REQUIRES(ctx, message_count >= 1,
+                errors::InvalidArgument(
+                    "Bufs argument must contain at least one value"));
+
+    int field_count = fields_.size();
+
+    // Save the argument shape for later, then flatten the input
+    // Tensor since we are working componentwise. We will restore
+    // the same shape in the returned Tensor.
+    const TensorShape& shape_prefix = buf_tensor.shape();
+
+    TensorShape sizes_shape = shape_prefix;
+    sizes_shape.AddDim(field_count);
+    Tensor* sizes_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, sizes_shape, &sizes_tensor));
+
+    // This is used to allocate binary bufs if used. It serves only
+    // to define memory ownership.
+    std::vector<string> tmp_binary_bufs(message_count);
+
+    // These are the actual buffers to use, which may be in tmp_binary_bufs
+    // or may be pointers into the buf_tensor. Either way they are not owned
+    // here.
+    std::vector<const string*> bufs;
+
+    if (is_binary_ && !sanitize_) {
+      // Fast path.
+      for (int mi = 0; mi < message_count; ++mi) {
+        const string* buf = &buf_tensor.flat<string>()(mi);
+        bufs.push_back(buf);
+      }
+    } else {
+      // We will have to allocate a copy, either to convert from text to
+      // binary or to sanitize a binary proto.
+      for (int mi = 0; mi < message_count; ++mi) {
+        ReserializeMessage(ctx, buf_tensor.flat<string>()(mi),
+                           &tmp_binary_bufs[mi]);
+        if (!ctx->status().ok()) {
+          return;
+        }
+        bufs.push_back(&tmp_binary_bufs[mi]);
+      }
+    }
+
+    // Walk through all the strings in the input tensor, counting
+    // the number of fields in each.
+    // We can't allocate our actual output Tensor until we know the
+    // maximum repeat count, so we do a first pass through the serialized
+    // proto just counting fields.
+    // We always allocate at least one value so that optional fields
+    // are populated with default values - this avoids a TF
+    // conditional when handling the output data.
+    // The caller can distinguish between real data and defaults
+    // using the repeat count matrix that is returned by decode_proto.
+    std::vector<int32> max_sizes(field_count, 1);
+    for (int mi = 0; mi < message_count; ++mi) {
+      CountFields(ctx, mi, *bufs[mi], sizes_tensor, &max_sizes);
+      if (!ctx->status().ok()) {
+        return;
+      }
+    }
+
+    // Allocate the output tensors now that we've seen the max size.
+    // TODO(nix): Use allocate_output_or_forward_input for the largest
+    //   output tensor. This can avoid one large allocation by re-using
+    //   the memory of the input tensor.
+    std::vector<Tensor*> outputs(field_count);
+    for (int fi = 0; fi < field_count; ++fi) {
+      TensorShape flat_shape = {static_cast<int64>(message_count),
+                                max_sizes[fi]};
+      TensorShape out_shape = shape_prefix;
+      out_shape.AddDim(max_sizes[fi]);
+
+      // Surprisingly we don't specify the types from the output_types
+      // attribute: that is done for us based on the Op declaration:
+      //  REGISTER_OP(...)
+      //    .Attr("output_types: list(type) >= 0")
+      //    .Output("values: output_types")
+      OP_REQUIRES_OK(ctx,
+                     // ctx->allocate_output(output_indices_[fi] + 1,
+                     ctx->allocate_output(fields_[fi]->output_index + 1,
+                                          out_shape, &outputs[fi]));
+    }
+
+    // Make the second pass through the serialized proto, decoding
+    // into preallocated tensors.
+    AccumulateFields(ctx, bufs, outputs);
+  }
+
+ private:
+  // Copy a serialized message to binary, e.g. to handle text proto inputs.
+  void ReserializeMessage(OpKernelContext* ctx, const string& buf,
+                          string* binary_buf) {
+    // Handle text protos by translating them to binary.
+    std::unique_ptr<Message> message(message_prototype_->New());
+    OP_REQUIRES(ctx, message, errors::DataLoss("Initializing message failed"));
+
+    if (is_binary_) {
+      // If we get here we are sanitizing the input protobuf by parsing
+      // and reserializing it with a trusted (but very slow) library.
+      OP_REQUIRES(ctx, message->ParseFromString(buf),
+                  errors::DataLoss("Unable to parse binary protobuf"));
+    } else {
+      OP_REQUIRES(ctx, TextFormat::ParseFromString(buf, message.get()),
+                  errors::DataLoss("Unable to parse text protobuf"));
+    }
+
+    OP_REQUIRES(ctx, message->SerializeToString(binary_buf),
+                errors::DataLoss("Unable to reserialize text proto as binary"));
+  }
+
+  // Count the number of occurrences of each requested field in a message batch.
+  void CountFields(OpKernelContext* ctx, int message_index, const string& buf,
+                   Tensor* sizes_tensor, std::vector<int32>* max_sizes) {
+    int field_count = fields_.size();
+
+    CodedInputStream input(reinterpret_cast<const uint8*>(buf.c_str()),
+                           buf.size());
+
+    std::vector<int32> field_sizes(field_count, 0);
+    std::vector<CountCollector> counters;
+    counters.reserve(field_count);
+    for (int i = 0; i < field_count; i++) {
+      counters.emplace_back(&field_sizes[i]);
+    }
+
+    Status st = Collect(&input, &counters);
+    if (st.ok() && !input.ConsumedEntireMessage()) {
+      st = errors::DataLoss("CountFields: Failed to consume entire buffer");
+    }
+    if (kFailOnDecodeError) {
+      OP_REQUIRES_OK(ctx, st);  // NOLINT
+    }
+    if (!st.ok()) {
+      // This code suppresses the corrupt proto, treating it as empty
+      // to avoid crashing the process.
+      LOG(WARNING) << "Proto counting error for message type " << message_type_
+                   << ": " << st;
+
+      for (int fi = 0; fi < field_count; fi++) {
+        field_sizes[fi] = 0;
+      }
+      // Finished decoding this message.
+      return;
+    }
+
+    // Update the size tensor and max repeat size for each field.
+    auto sizes = sizes_tensor->flat_inner_dims<int32>();
+    for (int fi = 0; fi < field_count; fi++) {
+      int32 size = field_sizes[fi];
+      sizes(message_index, fields_[fi]->output_index) = size;
+      if ((*max_sizes)[fi] < size) {
+        (*max_sizes)[fi] = size;
+      }
+    }
+  }
+
+  // Parse fields from a serialized message into preallocated tensors.
+  void AccumulateFields(OpKernelContext* ctx,
+                        const std::vector<const string*>& bufs,
+                        std::vector<Tensor*> outputs) {
+    struct TensorInfo {
+      explicit TensorInfo(Tensor* tensor) {
+        // Note that we can decode only max_repeat_count values before overflow.
+        // No other bounds checking is done for repeated fields. For
+        // optional fields there is a check to make sure that only the last
+        // value on the wire appears in the output tensor.
+        dtype = tensor->dtype();
+        last_dim_size = tensor->dim_size(tensor->dims() - 1);
+
+        if (dtype != DT_STRING) {
+          const int element_size = DataTypeSize(dtype);
+          CHECK_GT(element_size, 0);
+          stride = last_dim_size * element_size;
+
+          const int64 flatshape[1] = {tensor->NumElements() * element_size};
+          data = tensor->bit_casted_shaped<uint8, 1>(flatshape).data();
+        } else {
+          // DataTypeSize() returns 0 for string types.
+          stride = last_dim_size * sizeof(string);
+          data = reinterpret_cast<uint8*>(tensor->flat<string>().data());
+        }
+      }
+
+      DataType dtype;
+      int last_dim_size;
+      int stride;
+      uint8* data;
+    };
+
+    int field_count = fields_.size();
+
+    std::vector<TensorInfo> tensors;
+    tensors.reserve(field_count);
+    for (int fi = 0; fi < field_count; fi++) {
+      tensors.emplace_back(outputs[fi]);
+    }
+
+    for (int message_index = 0; message_index < bufs.size(); ++message_index) {
+      const string& buf = *bufs[message_index];
+
+      std::vector<DenseCollector> collectors;
+      collectors.reserve(field_count);
+      for (const TensorInfo& info : tensors) {
+        collectors.emplace_back(info.data + message_index * info.stride,
+                                info.dtype, info.last_dim_size);
+      }
+
+      // Fill in output tensors from the wire.
+      CodedInputStream input(reinterpret_cast<const uint8*>(buf.c_str()),
+                             buf.size());
+      Status st = Collect(&input, &collectors);
+      if (st.ok() && !input.ConsumedEntireMessage()) {
+        st = errors::DataLoss(
+            "AccumulateFields: Failed to consume entire buffer");
+      }
+      if (kFailOnDecodeError) {
+        OP_REQUIRES_OK(ctx, st);  // NOLINT
+      }
+      if (!st.ok()) {
+        // This code suppresses the corrupt proto, treating it as empty
+        // to avoid crashing training.
+        LOG(WARNING) << "Proto counting error for message type "
+                     << message_type_ << ": " << st;
+      }
+
+      // Fill the remainder of the dense outputs with default values.
+      for (auto& collector : collectors) {
+        OP_REQUIRES_OK(ctx, collector.FillWithDefaults());
+      }
+    }
+  }
+
+  // Look up the FieldDescriptor for a particular field number.
+  bool LookupField(int field_number, int* field_index) {
+    // Look up the FieldDescriptor using linear search.
+    // TODO(nix): this could be sped up with binary search, but we are
+    // already way off the fastpath at this point. If you see a hotspot
+    // here, somebody is sending you very inefficient protos.
+    for (int fi = fields_.size() - 1; fi >= 0; fi--) {
+      if (field_number == fields_[fi]->number) {
+        *field_index = fi;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Traverses a serialized protobuf, dispatching values to the collectors.
+  template <class CollectorClass>
+  Status Collect(CodedInputStream* input,
+                 std::vector<CollectorClass>* collectors) {
+    int last_good_field_index = -1;
+    bool fields_disordered = false;
+    int prev_field_number = -1;
+    int field_number = -1;
+    int last_good_field_number = -1;
+    int next_good_field_number = fields_[0]->number;
+
+    // The 'tag' variable should always be treated as tainted.
+    for (uint32 tag = input->ReadTag();
+         tag != 0 && WireFormatLite::GetTagWireType(tag) !=
+                         WireFormatLite::WIRETYPE_END_GROUP;
+         tag = input->ReadTag(), prev_field_number = field_number) {
+      field_number = WireFormatLite::GetTagFieldNumber(tag);
+      const FieldInfo* field = nullptr;
+
+      // This takes advantage of the sorted field numbers in most serialized
+      // protos: it tries the next expected field first rather than doing
+      // a lookup by field number.
+      // TODO(nix): haberman@ suggests a hybrid approach with a lookup table
+      // for small field numbers and a hash table for larger ones. This would
+      // be a simpler approach that should offer comparable speed in most
+      // cases.
+      if (field_number == last_good_field_number) {
+        field = fields_[last_good_field_index].get();
+      } else {
+        if (field_number < prev_field_number) {
+          fields_disordered = true;
+        }
+
+        // If fields are out of order, fall back to slow lookup.
+        if (fields_disordered) {
+          int field_index;
+          if (LookupField(field_number, &field_index)) {
+            field = fields_[field_index].get();
+            last_good_field_index = field_index;
+          }
+        } else {
+          // If we see a field that is past the next field we want,
+          // it was empty. Look for the one after that.
+          // Repeat until we run out of fields that we care about.
+          while (field_number >= next_good_field_number) {
+            if (field_number == next_good_field_number) {
+              last_good_field_number = field_number;
+              field = fields_[last_good_field_index + 1].get();
+            }
+
+            // Start looking for the field after the current one.
+            ++last_good_field_index;
+            if (last_good_field_index < fields_.size() - 1) {
+              next_good_field_number =
+                  fields_[last_good_field_index + 1]->number;
+            } else {
+              // Saw something past the last field we care about.
+              // Continue parsing the message just in case there
+              // are disordered fields later, but any remaining
+              // ordered fields will have no effect.
+              next_good_field_number = INT_MAX;
+            }
+          }
+        }
+      }
+
+      if (!field) {
+        // Unknown and unrequested fields are skipped.
+        if (!WireFormatLite::SkipField(input, tag)) {
+          return errors::DataLoss("Failed skipping unrequested field");
+        }
+        continue;
+      }
+
+      Status st = CollectField(*field, WireFormatLite::GetTagWireType(tag),
+                               input, &(*collectors)[last_good_field_index]);
+      if (!st.ok()) {
+        return st;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Collects values for a single field.
+  template <class CollectorClass>
+  Status CollectField(const FieldInfo& field,
+                      WireFormatLite::WireType wire_type,
+                      CodedInputStream* input, CollectorClass* collector) {
+    // The wire format library defines the same constants used in
+    // descriptor.proto. This static_cast is safe because they
+    // are guaranteed to stay in sync.
+    // We need the field type from the FieldDescriptor here
+    // because the wire format doesn't tell us anything about
+    // what happens inside a packed repeated field: there is
+    // enough information in the wire format to skip the
+    // whole field but not enough to know how to parse what's
+    // inside. For that we go to the schema.
+    WireFormatLite::WireType schema_wire_type =
+        WireFormatLite::WireTypeForFieldType(field.type);
+
+    // Handle packed repeated fields. SkipField would skip the
+    // whole length-delimited blob without letting us count the
+    // values, so we have to scan them ourselves.
+    if (wire_type == WireFormatLite::WIRETYPE_LENGTH_DELIMITED &&
+        schema_wire_type != WireFormatLite::WIRETYPE_LENGTH_DELIMITED) {
+      // Handle packed repeated primitives.
+      int length;
+      if (!input->ReadVarintSizeAsInt(&length)) {
+        return errors::DataLoss("CollectField: Failed reading packed size");
+      }
+      Status st = collector->ReadPackedValues(input, field, length);
+      if (!st.ok()) {
+        return st;
+      }
+      return Status::OK();
+    }
+
+    // Read ordinary values, including strings, bytes, and messages.
+    if (wire_type != schema_wire_type) {
+      if (!WireFormatLite::SkipField(
+              input, WireFormatLite::MakeTag(field.number, wire_type))) {
+        return errors::DataLoss(
+            "CollectField: Failed skipping malformed field");
+      }
+      return Status::OK();
+    }
+    return collector->ReadValue(input, field);
+  }
+
+  string message_type_;
+  // Note that fields are sorted by increasing field number,
+  // which is not in general the order given by the user-specified
+  // field_names and output_types Op attributes.
+  std::vector<std::unique_ptr<const FieldInfo>> fields_;
+
+  // Owned_desc_pool_ is null when using descriptor_source=local.
+  std::unique_ptr<DescriptorPool> owned_desc_pool_;
+  DynamicMessageFactory message_factory_;
+  const Message* message_prototype_;
+
+  // True if decoding binary format, false if decoding text format.
+  bool is_binary_;
+
+  // True if the protos should be sanitized before parsing.
+  // Enables the initial protobuf sanitizer, which is much
+  // more expensive than the decoder. The flag defaults to true
+  // but can be set to false for trusted sources.
+  // TODO(nix): flip the default to false when the fast decoder
+  // has passed security review.
+  bool sanitize_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(DecodeProtoOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeProtoV2").Device(DEVICE_CPU),
+                        DecodeProtoOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor.cc b/tensorflow/core/kernels/dense_update_functor.cc
index a878fe9a97059e2d55164600923c4a2e1312161b..3ed3794e01d63d49e5be0406e3f892bfbec2c8c8 100644
--- a/tensorflow/core/kernels/dense_update_functor.cc
+++ b/tensorflow/core/kernels/dense_update_functor.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/dense_update_functor.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -70,4 +71,59 @@ struct DenseUpdate<CPUDevice, string, ASSIGN> {
 
 }  // namespace functor
 
+#define CPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+
+#define INSTANTIATE_GET_VARIANT_COPY_FN(DEVICE, TYPE_CALLER, TYPE_DENSE_COPY) \
+  template <>                                                                 \
+  Status VariantCopyFn<DEVICE>(OpKernelContext * context, const Tensor& from, \
+                               Tensor* to) {                                  \
+    PersistentTensor tmp;                                                     \
+    Tensor* tensor;                                                           \
+    AllocatorAttributes attr;                                                 \
+    attr.set_gpu_compatible(true);                                            \
+    attr.set_nic_compatible(true);                                            \
+    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
+        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
+    switch (from.dtype()) {                                                   \
+      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
+      default:                                                                \
+        return errors::InvalidArgument(                                       \
+            "VariantCopyFn: Could not perform a deep copy of variant "        \
+            "element of type: ",                                              \
+            DataTypeString(from.dtype()),                                     \
+            " using device: ", context->device()->name());                    \
+    }                                                                         \
+    *to = *tensor;                                                            \
+    return Status::OK();                                                      \
+  }
+
+INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
+
+#if GOOGLE_CUDA
+#define GPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
+  TF_CALL_GPU_ALL_TYPES(T);                 \
+  TF_CALL_int32(T);                         \
+  TF_CALL_int64(T);
+INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
+                                GPU_DENSE_COPY);
+#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
+#undef GPU_DENSE_COPY
+#endif  // GOOGLE_CUDA
+
+#undef CPU_DENSE_COPY
+#undef INSTANTIATE_GET_VARIANT_COPY_FN
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor.h b/tensorflow/core/kernels/dense_update_functor.h
index 4aefe26c545ee5eaf3868b73cd9ace38fd135f53..240c13261eaf1da256a326329c8eb72cce2cbcab 100644
--- a/tensorflow/core/kernels/dense_update_functor.h
+++ b/tensorflow/core/kernels/dense_update_functor.h
@@ -19,11 +19,14 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
@@ -89,6 +92,17 @@ struct DenseUpdate<SYCLDevice, T, ASSIGN> {
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
+
+template <typename Device>
+Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
+
+template <>
+Status VariantCopyFn<CPUDevice>(OpKernelContext* context, const Tensor& from,
+                                Tensor* to);
+template <>
+Status VariantCopyFn<GPUDevice>(OpKernelContext* context, const Tensor& from,
+                                Tensor* to);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/encode_proto_op.cc b/tensorflow/core/kernels/encode_proto_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b02ae52a23aeabe55e6233e34b15cffb2073ded
--- /dev/null
+++ b/tensorflow/core/kernels/encode_proto_op.cc
@@ -0,0 +1,591 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// EncodeProto is a TensorFlow Op which serializes tensors into
+// arbitrary protobufs.
+//
+// See the docstring in ../ops/encode_proto_op.cc for usage of the op.
+//
+// This implementation writes the serialized format using a handful of
+// calls from the WireFormatLite API.
+
+#include <memory>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/descriptors.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::protobuf::Descriptor;
+using ::tensorflow::protobuf::DescriptorPool;
+using ::tensorflow::protobuf::FieldDescriptor;
+using ::tensorflow::protobuf::internal::WireFormatLite;
+using ::tensorflow::protobuf::io::CodedOutputStream;
+using ::tensorflow::protobuf::io::StringOutputStream;
+
+// Computes the total serialized size for a packed repeated field.
+// For fixed-size types this can just multiply, but for variable-sized
+// types it has to iterate through the values in the tensor.
+template <WireFormatLite::FieldType FieldType, typename TensorT>
+size_t TotalPackedSize(const Tensor& input, int message_index, int size);
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_DOUBLE, double>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kDoubleSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FLOAT, double>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  return size * WireFormatLite::kFloatSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FLOAT, float>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  return size * WireFormatLite::kFloatSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_INT64, int64>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::Int64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT64, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_INT32, int32>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::Int32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED64, int64>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed64Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED32, int64>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED32, int32>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_BOOL, bool>(const Tensor& input,
+                                                        int message_index,
+                                                        int size) {
+  return size * WireFormatLite::kBoolSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT32, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT32, int32>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_ENUM, int32>(const Tensor& input,
+                                                         int message_index,
+                                                         int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size +=
+        WireFormatLite::EnumSize(input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SFIXED32, int32>(
+    const Tensor& input, int message_index, int size) {
+  return size * WireFormatLite::kSFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SFIXED64, int64>(
+    const Tensor& input, int message_index, int size) {
+  return size * WireFormatLite::kSFixed64Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SINT32, int32>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::SInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SINT64, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::SInt64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+// Writes a possibly repeated primitive field.
+// TensorFlow does not have unsigned types, so we decode them to signed and
+// encode them back to unsigned.
+template <typename TensorT, typename ProtoT,
+          WireFormatLite::FieldType FieldType,
+          void Writer(ProtoT, CodedOutputStream*)>
+void WriteField(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  auto wire_type = WireFormatLite::WireTypeForFieldType(
+      WireFormatLite::FieldType(field_desc.type()));
+
+  auto input_t = input.flat_inner_dims<TensorT>();
+  if (field_desc.options().packed()) {
+    // Write the tag for the packed field.
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_LENGTH_DELIMITED, output);
+
+    // Write the total packed length.
+    size_t data_size =
+        TotalPackedSize<FieldType, TensorT>(input, message_index, size);
+    output->WriteVarint32(data_size);
+
+    // Write individual values.
+    for (int64 i = 0; i < size; i++) {
+      // Note implicit cast from signed to unsigned.
+      const ProtoT& value = input_t(static_cast<int64>(message_index), i);
+      Writer(value, output);
+    }
+  } else {
+    for (int64 i = 0; i < size; i++) {
+      WireFormatLite::WriteTag(field_desc.number(), wire_type, output);
+
+      // Note implicit cast from signed to unsigned.
+      const ProtoT& value = input_t(static_cast<int64>(message_index), i);
+      Writer(value, output);
+    }
+  }
+}
+
+// Writes a possibly repeated string, bytes, or message field.
+template <typename T, void Writer(int, const T&, CodedOutputStream*)>
+void WriteVarLenField(const FieldDescriptor& field_desc, const Tensor& input,
+                      int message_index, int size, CodedOutputStream* output) {
+  auto input_t = input.flat_inner_dims<T>();
+  for (int64 i = 0; i < size; i++) {
+    const T& value = input_t(static_cast<int64>(message_index), i);
+    // TODO(nix): there doesn't seem to be an inlined version of
+    // WireFormatLite::WriteString or its relatives, which might allow a
+    // small speedup.
+    Writer(field_desc.number(), value, output);
+  }
+}
+
+// Writes a group field.
+// Groups are treated like submessages, but tag-delimited
+// instead of length-delimited. WireFormatLite handles this
+// differently so we code it ourselves.
+void WriteGroup(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  auto input_t = input.flat_inner_dims<string>();
+  for (int64 i = 0; i < size; i++) {
+    const string& value = input_t(static_cast<int64>(message_index), i);
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_START_GROUP, output);
+    // Note the use of WriteRaw instead of WriteString to skip the length.
+    output->WriteRaw(value.data(), value.size());
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_END_GROUP, output);
+  }
+}
+
+// Writes a (possibly repeated) field into an output stream.
+// It is the caller's responsibility to ensure that the type of
+// the input tensor is compatible with the type of the proto
+// field descriptor, and that (message_index, size-1) is within
+// bounds.
+void WriteField(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  DataType tf_type = input.dtype();
+
+  switch (field_desc.type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return WriteField<double, double, WireFormatLite::TYPE_DOUBLE,
+                        WireFormatLite::WriteDoubleNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FLOAT:
+      switch (tf_type) {
+        case DataType::DT_FLOAT:
+          return WriteField<float, float, WireFormatLite::TYPE_FLOAT,
+                            WireFormatLite::WriteFloatNoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_DOUBLE:
+          return WriteField<double, float, WireFormatLite::TYPE_FLOAT,
+                            WireFormatLite::WriteFloatNoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_INT64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_INT64,
+                        WireFormatLite::WriteInt64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_UINT64:
+      return WriteField<int64, protobuf_uint64, WireFormatLite::TYPE_UINT64,
+                        WireFormatLite::WriteUInt64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_INT32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_INT32,
+                        WireFormatLite::WriteInt32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FIXED64:
+      return WriteField<int64, protobuf_uint64, WireFormatLite::TYPE_FIXED64,
+                        WireFormatLite::WriteFixed64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FIXED32:
+      switch (tf_type) {
+        case DataType::DT_INT64:
+          return WriteField<int64, uint32, WireFormatLite::TYPE_FIXED32,
+                            WireFormatLite::WriteFixed32NoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_INT32:
+          return WriteField<int32, uint32, WireFormatLite::TYPE_FIXED32,
+                            WireFormatLite::WriteFixed32NoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_BOOL:
+      return WriteField<bool, bool, WireFormatLite::TYPE_BOOL,
+                        WireFormatLite::WriteBoolNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_STRING:
+      return WriteVarLenField<string, WireFormatLite::WriteString>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_GROUP:
+      return WriteGroup(field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_MESSAGE:
+      return WriteVarLenField<string, WireFormatLite::WriteBytes>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_BYTES:
+      return WriteVarLenField<string, WireFormatLite::WriteBytes>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_UINT32:
+      switch (tf_type) {
+        case DataType::DT_INT64:
+          return WriteField<int64, uint32, WireFormatLite::TYPE_UINT32,
+                            WireFormatLite::WriteUInt32NoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_INT32:
+          return WriteField<int32, uint32, WireFormatLite::TYPE_UINT32,
+                            WireFormatLite::WriteUInt32NoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_ENUM:
+      return WriteField<int32, int32, WireFormatLite::TYPE_ENUM,
+                        WireFormatLite::WriteEnumNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SFIXED32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_SFIXED32,
+                        WireFormatLite::WriteSFixed32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SFIXED64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_SFIXED64,
+                        WireFormatLite::WriteSFixed64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SINT32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_SINT32,
+                        WireFormatLite::WriteSInt32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SINT64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_SINT64,
+                        WireFormatLite::WriteSInt64NoTag>(
+          field_desc, input, message_index, size, output);
+      // default: intentionally omitted in order to enable static checking.
+  }
+}
+
+// Checks that a Protobuf field is compatible with a TensorFlow datatype.
+// This is separated from WriteField to lift it out of the inner loop.
+bool IsCompatibleType(const FieldDescriptor& field_desc, DataType tf_type) {
+  switch (field_desc.type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return tf_type == DataType::DT_DOUBLE;
+    case WireFormatLite::TYPE_FLOAT:
+      return tf_type == DataType::DT_FLOAT || tf_type == DataType::DT_DOUBLE;
+    case WireFormatLite::TYPE_INT64:
+    case WireFormatLite::TYPE_SFIXED64:
+    case WireFormatLite::TYPE_SINT64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_UINT64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_INT32:
+    case WireFormatLite::TYPE_ENUM:
+    case WireFormatLite::TYPE_SFIXED32:
+    case WireFormatLite::TYPE_SINT32:
+      return tf_type == DataType::DT_INT32;
+    case WireFormatLite::TYPE_FIXED64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_FIXED32:
+    case WireFormatLite::TYPE_UINT32:
+      return tf_type == DataType::DT_INT64 || tf_type == DataType::DT_INT32;
+    case WireFormatLite::TYPE_BOOL:
+      return tf_type == DataType::DT_BOOL;
+    case WireFormatLite::TYPE_STRING:
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+    case WireFormatLite::TYPE_BYTES:
+      return tf_type == DataType::DT_STRING;
+      // default: intentionally omitted in order to enable static checking.
+  }
+  return false;
+}
+
+class EncodeProtoOp : public OpKernel {
+ public:
+  explicit EncodeProtoOp(OpKernelConstruction* context) : OpKernel(context) {
+    string descriptor_source;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("descriptor_source", &descriptor_source));
+    // We always get back a desc_pool, but we may not own it. If we own it,
+    // owned_desc_pool_ will be filled in.
+    DescriptorPool const* desc_pool;
+    OP_REQUIRES_OK(context, GetDescriptorPool(context->env(), descriptor_source,
+                                              &desc_pool, &owned_desc_pool_));
+
+    string message_type;
+    OP_REQUIRES_OK(context, context->GetAttr("message_type", &message_type));
+    const Descriptor* message_desc =
+        desc_pool->FindMessageTypeByName(message_type);
+    OP_REQUIRES(context, message_desc != nullptr,
+                errors::InvalidArgument("No descriptor found for message type ",
+                                        message_type));
+
+    OP_REQUIRES_OK(context, context->GetAttr("field_names", &field_names_));
+
+    // Gather the field descriptors for the given field_names.
+    field_descs_.resize(field_names_.size());
+    for (int i = 0; i < field_names_.size(); i++) {
+      const string& name = field_names_[i];
+      auto field_desc = message_desc->FindFieldByName(name);
+      OP_REQUIRES(context, field_desc != nullptr,
+                  errors::InvalidArgument("Unknown field: ", name,
+                                          " in message type ", message_type));
+
+      field_descs_[i] = field_desc;
+    }
+
+    // Build a list of indices into field_descs sorted by increasing
+    // field_number. This will be used to output fields in sorted order,
+    // which is strongly encouraged when serializing protobufs.
+    sorted_field_index_.resize(field_names_.size());
+    // Start with the fields sorted by current index.
+    for (int i = 0; i < field_names_.size(); i++) sorted_field_index_[i] = i;
+    // Then sort the field indices by their proto field number.
+    std::sort(sorted_field_index_.begin(), sorted_field_index_.end(),
+              [this](int a, int b) -> bool {
+                return field_descs_[a]->number() < field_descs_[b]->number();
+              });
+  }
+
+  void Compute(OpKernelContext* cx) override {
+    const Tensor* sizes_tensor;
+    OP_REQUIRES_OK(cx, cx->input("sizes", &sizes_tensor));
+
+    OpInputList values;
+    OP_REQUIRES_OK(cx, cx->input_list("values", &values));
+
+    OP_REQUIRES(cx, field_descs_.size() == values.size(),
+                errors::InvalidArgument(
+                    "Length of inputs list must match field_names"));
+
+    // Check the arguments for consistency.
+    TensorShape common_prefix;
+    int message_count;
+    for (int i = 0; i < field_descs_.size(); i++) {
+      const Tensor& v = values[i];
+
+      // The type of each value tensor must match the corresponding field.
+      OP_REQUIRES(cx, IsCompatibleType(*field_descs_[i], v.dtype()),
+                  errors::InvalidArgument(
+                      "Incompatible type for field " + field_names_[i] +
+                          ".  Saw dtype: ",
+                      DataTypeString(v.dtype()),
+                      " but field type is: ", field_descs_[i]->type_name()));
+
+      // All value tensors must have the same shape prefix (i.e. batch size).
+      TensorShape shape_prefix = v.shape();
+      shape_prefix.RemoveDim(shape_prefix.dims() - 1);
+
+      // Do some initialization on the first input value. The rest will
+      // have to match this one.
+      if (i == 0) {
+        OP_REQUIRES(cx, v.dims() >= 1,
+                    errors::InvalidArgument(
+                        "Expected value to be at least a vector, saw shape: ",
+                        v.shape().DebugString()));
+        common_prefix = shape_prefix;
+        message_count = common_prefix.num_elements();
+      } else {
+        OP_REQUIRES(cx, shape_prefix == common_prefix,
+                    errors::InvalidArgument(
+                        "Values must match up to the last dimension"));
+      }
+    }
+
+    TensorShape expected_sizes_shape = common_prefix;
+    expected_sizes_shape.AddDim(field_descs_.size());
+
+    OP_REQUIRES(cx, sizes_tensor->shape() == expected_sizes_shape,
+                errors::InvalidArgument(
+                    "sizes should be batch_size + [len(field_names)].  Saw: ",
+                    sizes_tensor->shape().DebugString(),
+                    " but expected: ", expected_sizes_shape.DebugString()));
+
+    auto sizes = sizes_tensor->flat_inner_dims<int32>();
+
+    for (int i = 0; i < field_descs_.size(); ++i) {
+      const Tensor& v = values[i];
+      int max_size = v.dim_size(v.dims() - 1);
+
+      // The last dimension of a value tensor must be greater than the
+      // corresponding
+      // size in the sizes tensor.
+      for (int message_index = 0; message_index < message_count;
+           message_index++) {
+        OP_REQUIRES(
+            cx, sizes(message_index, i) <= max_size,
+            errors::InvalidArgument(
+                "Size to write must not be larger than value tensor; but saw: ",
+                sizes(message_index, i), " > ", max_size, " at message ",
+                message_index, " field ", i));
+      }
+    }
+
+    // This pointer is owned by the context.
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(cx, cx->allocate_output(0, common_prefix, &output_tensor));
+
+    auto bufs = output_tensor->flat<string>();
+    for (int message_index = 0; message_index < message_count;
+         message_index++) {
+      // TODO(nix): possibly optimize allocation here by calling
+      //   bufs(message_index).reserve(DEFAULT_BUF_SIZE);
+      StringOutputStream output_string(&bufs(message_index));
+      CodedOutputStream out(&output_string);
+      // Write fields in ascending field_number order.
+      for (int i : sorted_field_index_) {
+        auto& field_desc = *field_descs_[i];
+        const Tensor& v = values[i];
+        int size = sizes(message_index, i);
+        if (!size) continue;
+        WriteField(field_desc, v, message_index, size, &out);
+      }
+    }
+  }
+
+ private:
+  std::vector<string> field_names_;
+  std::vector<const FieldDescriptor*> field_descs_;
+
+  // Owned_desc_pool_ is null when using descriptor_source=local.
+  std::unique_ptr<DescriptorPool> owned_desc_pool_;
+
+  // Contains indices into field_names_, sorted by field number since
+  // that's the order of writing.
+  std::vector<int> sorted_field_index_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(EncodeProtoOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("EncodeProto").Device(DEVICE_CPU), EncodeProtoOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
index 3487606778eabde386335f8450d627b7bf74ad42..050c95cf40d4b29bde66b6b6e72b1b48a7199965 100644
--- a/tensorflow/core/kernels/fill_functor.cu.cc
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -76,7 +76,7 @@ struct FillFunctor<GPUDevice, T> {
 };
 
 #define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
-TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
+TF_CALL_NUMBER_TYPES(DEFINE_FILL_GPU);
 TF_CALL_bool(DEFINE_FILL_GPU);
 #undef DEFINE_FILL_GPU
 
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 16ccb03b8502dd626c0dc4f0c10fcfe50224c7b8..2c6e8bf3bcbd9270ed47d37eec6c88d7b3cfdb1c 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -28,6 +28,7 @@ limitations under the License.
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
@@ -50,7 +51,7 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   }
   // Compute slice_bytes here so that static knowledge is available
   const size_t slice_bytes = slice_elems * sizeof(T);
-  auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+  auto* worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
   // Store the value of invalidate index for printing error information, it's a
   // shared variable.
@@ -162,6 +163,16 @@ struct GatherFunctor<CPUDevice, T, Index> {
   }
 };
 
+template <typename Index>
+struct GatherFunctor<GPUDevice, Variant, Index> {
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<Variant, 3>::ConstTensor params,
+                   typename TTypes<Index>::ConstFlat indices,
+                   typename TTypes<Variant, 3>::Tensor out) {
+    return GatherFunctorCPU<Variant, Index>()(ctx, params, indices, out);
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index a71d047ed1a381bfc0311f86987f585f51b02536..ef6ce0546b0811edda3331de69906237cca76dd4 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -213,13 +213,13 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_EMPTY(type)                                  \
+#define REGISTER_PARALLEL_CONCAT_START(type)                  \
   REGISTER_KERNEL_BUILDER(Name("_ParallelConcatStart")        \
                               .Device(DEVICE_GPU)             \
                               .TypeConstraint<type>("dtype"), \
                           ParallelConcatStart<GPUDevice, type>);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_EMPTY)
-#undef REGISTER_EMPTY
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_PARALLEL_CONCAT_START)
+#undef REGISTER_PARALLEL_CONCAT_START
 
 #define REGISTER_PARALLEL_CONCAT(type)                                     \
   REGISTER_KERNEL_BUILDER(                                                 \
@@ -248,5 +248,295 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                         ParallelConcatUpdate<CPUDevice>);
 #endif
 
+class InplaceOpBase : public OpKernel {
+ public:
+  explicit InplaceOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto x = ctx->input(0);
+    auto i = ctx->input(1);
+    auto v = ctx->input(2);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(i.shape()),
+                errors::InvalidArgument("i must be a vector. ",
+                                        i.shape().DebugString()));
+    OP_REQUIRES(ctx, x.dims() == v.dims(),
+                errors::InvalidArgument(
+                    "x and v shape doesn't match (ranks differ): ",
+                    x.shape().DebugString(), " vs. ", v.shape().DebugString()));
+    for (int i = 1; i < x.dims(); ++i) {
+      OP_REQUIRES(
+          ctx, x.dim_size(i) == v.dim_size(i),
+          errors::InvalidArgument("x and v shape doesn't match at index ", i,
+                                  " : ", x.shape().DebugString(), " vs. ",
+                                  v.shape().DebugString()));
+    }
+    OP_REQUIRES(ctx, i.dim_size(0) == v.dim_size(0),
+                errors::InvalidArgument(
+                    "i and x shape doesn't match at index 0: ",
+                    i.shape().DebugString(), " vs. ", v.shape().DebugString()));
+
+    Tensor y = x;  // This creates an alias intentionally.
+    OP_REQUIRES_OK(ctx, DoCompute(ctx, i, v, &y));
+    ctx->set_output(0, y);
+  }
+
+ protected:
+  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& i,
+                           const Tensor& v, Tensor* y) = 0;
+};
+
+}  // end namespace
+
+namespace functor {
+
+template <typename T>
+void DoInplaceOp(const CPUDevice& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  auto Ti = i.flat<int32>();
+  auto Tv = v.flat_outer_dims<T>();
+  auto Ty = y->flat_outer_dims<T>();
+  auto nrows = Ty.dimension(0);
+  for (int64 j = 0; j < Ti.size(); ++j) {
+    auto r = (Ti(j) % nrows + nrows) % nrows;  // Guard index range.
+    switch (op) {
+      case I_UPDATE:
+        Ty.template chip<0>(r).device(d) = Tv.template chip<0>(j);
+        break;
+      case I_ADD:
+        Ty.template chip<0>(r).device(d) += Tv.template chip<0>(j);
+        break;
+      case I_SUB:
+        Ty.template chip<0>(r).device(d) -= Tv.template chip<0>(j);
+        break;
+    }
+  }
+}
+
+// String type only supports inplace update.
+void DoInplaceStringUpdateOp(const CPUDevice& d, const Tensor& i,
+                             const Tensor& v, Tensor* y) {
+  auto Ti = i.flat<int32>();
+  auto Tv = v.flat_outer_dims<string>();
+  auto Ty = y->flat_outer_dims<string>();
+  auto nrows = Ty.dimension(0);
+  for (int64 j = 0; j < Ti.size(); ++j) {
+    auto r = (Ti(j) % nrows + nrows) % nrows;  // Guard index range.
+    Ty.template chip<0>(r).device(d) = Tv.template chip<0>(j);
+  }
+}
+
+template <>
+Status DoInplace(const CPUDevice& device, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  CHECK_EQ(v.dtype(), y->dtype());
+  if (op == I_UPDATE) {
+    if (v.dtype() == DT_STRING) {
+      DoInplaceStringUpdateOp(device, i, v, y);
+      return Status::OK();
+    } else if (v.dtype() == DT_BOOL) {
+      DoInplaceOp<bool>(device, op, i, v, y);
+      return Status::OK();
+    }
+  }
+  switch (v.dtype()) {
+#define CASE(type)                          \
+  case DataTypeToEnum<type>::value:         \
+    DoInplaceOp<type>(device, op, i, v, y); \
+    break;
+    TF_CALL_NUMBER_TYPES(CASE);
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+  }
+  return Status::OK();
+}
+
+}  // end namespace functor
+
+namespace {
+template <typename Device, functor::InplaceOpType op>
+class InplaceOp : public InplaceOpBase {
+ public:
+  explicit InplaceOp(OpKernelConstruction* ctx) : InplaceOpBase(ctx) {}
+
+ protected:
+  Status DoCompute(OpKernelContext* ctx, const Tensor& i, const Tensor& v,
+                   Tensor* y) override {
+    const auto& d = ctx->eigen_device<Device>();
+    return ::tensorflow::functor::DoInplace(d, op, i, v, y);
+  }
+};
+
+class CopyOpBase : public OpKernel {
+ public:
+  explicit CopyOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto x = ctx->input(0);
+    Tensor* y;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, x.shape(), &y));
+    OP_REQUIRES_OK(ctx, DoCompute(ctx, x, y));
+  }
+
+ protected:
+  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& x,
+                           Tensor* y) = 0;
+};
+
+template <typename Device>
+class CopyOp : public CopyOpBase {
+ public:
+  explicit CopyOp(OpKernelConstruction* ctx) : CopyOpBase(ctx) {}
+
+ protected:
+  Status DoCompute(OpKernelContext* ctx, const Tensor& x, Tensor* y) override {
+    const auto& d = ctx->eigen_device<Device>();
+    return ::tensorflow::functor::DoCopy(d, x, y);
+  }
+};
+
+}  // end namespace
+
+namespace functor {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <>
+Status DoCopy(const CPUDevice& device, const Tensor& x, Tensor* y) {
+  CHECK_EQ(x.dtype(), y->dtype());
+  switch (x.dtype()) {
+#define CASE(type)                                   \
+  case DataTypeToEnum<type>::value:                  \
+    y->flat<type>().device(device) = x.flat<type>(); \
+    break;
+
+    TF_CALL_NUMBER_TYPES(CASE);
+    TF_CALL_bool(CASE);
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", x.dtype());
+  }
+  return Status::OK();
+}
+
+}  // end namespace functor
+
+namespace {
+template <typename Device, typename T>
+class EmptyOp : public OpKernel {
+ public:
+  explicit EmptyOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("init", &init_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& shape = ctx->input(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(shape.shape()),
+        errors::InvalidArgument("shape must be a vector of int32, got shape ",
+                                shape.shape().DebugString()));
+    auto dims = shape.flat<int32>();
+    TensorShape out_shape;
+    OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
+                            reinterpret_cast<const int32*>(dims.data()),
+                            dims.size(), &out_shape));
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+
+    if (init_) {
+      functor::SetZeroFunctor<Device, T>()(ctx->eigen_device<Device>(),
+                                           out->flat<T>());
+    }
+  }
+
+ private:
+  bool init_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InplaceUpdate").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+REGISTER_KERNEL_BUILDER(Name("InplaceAdd").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_ADD>);
+REGISTER_KERNEL_BUILDER(Name("InplaceSub").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_SUB>);
+REGISTER_KERNEL_BUILDER(Name("DeepCopy").Device(DEVICE_CPU), CopyOp<CPUDevice>);
+
+#define REGISTER_EMPTY(type, dev)                             \
+  REGISTER_KERNEL_BUILDER(Name("Empty")                       \
+                              .Device(DEVICE_##dev)           \
+                              .HostMemory("shape")            \
+                              .TypeConstraint<type>("dtype"), \
+                          EmptyOp<dev##Device, type>)
+
+REGISTER_EMPTY(float, CPU)
+REGISTER_EMPTY(double, CPU)
+REGISTER_EMPTY(Eigen::half, CPU)
+REGISTER_EMPTY(string, CPU)
+REGISTER_EMPTY(int32, CPU)
+REGISTER_EMPTY(int64, CPU)
+REGISTER_EMPTY(bool, CPU)
+
+#if GOOGLE_CUDA
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceUpdate").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      InplaceOp<GPUDevice, functor::I_UPDATE>);                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceAdd").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),    \
+      InplaceOp<GPUDevice, functor::I_ADD>);                              \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceSub").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),    \
+      InplaceOp<GPUDevice, functor::I_SUB>);                              \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DeepCopy").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),      \
+      CopyOp<GPUDevice>);
+
+REGISTER(float);
+REGISTER(double);
+REGISTER(Eigen::half);
+REGISTER(int64);
+
+REGISTER_KERNEL_BUILDER(Name("InplaceUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+REGISTER_KERNEL_BUILDER(Name("InplaceAdd")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_ADD>);
+REGISTER_KERNEL_BUILDER(Name("InplaceSub")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_SUB>);
+
+REGISTER_KERNEL_BUILDER(Name("DeepCopy")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        CopyOp<CPUDevice>);
+REGISTER_EMPTY(float, GPU);
+REGISTER_EMPTY(double, GPU);
+REGISTER_EMPTY(Eigen::half, GPU);
+REGISTER_EMPTY(int64, GPU);
+
+#endif  // GOOGLE_CUDA
+
 }  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops_functor.h b/tensorflow/core/kernels/inplace_ops_functor.h
index 53529f51653f35f0815bb640ec244e4acccade2a..b806787e91c39d0add8ec6bb386a56d12a3b4b24 100644
--- a/tensorflow/core/kernels/inplace_ops_functor.h
+++ b/tensorflow/core/kernels/inplace_ops_functor.h
@@ -26,6 +26,23 @@ template <typename Device>
 Status DoParallelConcat(const Device& device, const Tensor& value, int32 loc,
                         Tensor* output);
 
+// Inplace update/add/sub values in 'y'. It computes
+//   y[i, :] = v if op is I_UPDATE
+//   y[i, :] += v if op is I_ADD
+//   y[i, :] -= v if op is I_SUB
+// Returns an error if the operation fails.
+enum InplaceOpType {
+  I_UPDATE,  // x = y
+  I_ADD,     // x += y
+  I_SUB,     // x -= y
+};
+template <typename Device>
+Status DoInplace(const Device& device, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y);
+// Copies x into y.
+template <typename Device>
+Status DoCopy(const Device& device, const Tensor& x, Tensor* y);
+
 }  // end namespace functor
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index 8467360435af7b2a267a14e62d432808ec39e239..f1616b1ea88c93fc8ce039c8afd0be0d13504317 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -77,6 +77,103 @@ Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc,
   return Status::OK();
 }
 
+template <typename T, InplaceOpType op>
+__global__ void DoInplaceOpKernel(int nthreads, const int64 rows,
+                                  const int64 cols, const int64 n, const T* src,
+                                  const int32* rowids, T* dst) {
+  CUDA_1D_KERNEL_LOOP(idx, nthreads) {
+    int64 r = idx / cols;
+    int64 c = idx % cols;
+    r = (rowids[r] % rows + rows) % rows;  // Guard index range.
+    T* p = dst + r * cols + c;
+    const T* q = src + idx;
+    switch (op) {
+      case I_UPDATE:
+        *p = ldg(q);
+        break;
+      case I_ADD:
+        *p += ldg(q);
+        break;
+      case I_SUB:
+        *p -= ldg(q);
+        break;
+    }
+  }
+}
+
+template <typename T>
+void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  const int64 nelem = v.NumElements();
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d);
+  auto Ty = y->flat_outer_dims<T>();
+  const int64 nrows = Ty.dimension(0);
+  const int64 ncols = Ty.dimension(1);
+  const int64 n = i.NumElements();
+  const T* src = v.flat<T>().data();
+  // TODO(sjhwang): Check that first dimension fits in int32 range.
+  const int32* rowids = i.flat<int32>().data();
+  T* dst = y->flat<T>().data();
+  switch (op) {
+    case I_UPDATE:
+      DoInplaceOpKernel<T, I_UPDATE>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+    case I_ADD:
+      DoInplaceOpKernel<T, I_ADD>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+    case I_SUB:
+      DoInplaceOpKernel<T, I_SUB>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+  }
+}
+
+template <>
+Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  CHECK_EQ(v.dtype(), y->dtype());
+  switch (v.dtype()) {
+#define CASE(type)                     \
+  case DataTypeToEnum<type>::value:    \
+    DoInplaceOp<type>(d, op, i, v, y); \
+    break;
+
+    CASE(float)
+    CASE(double)
+    CASE(Eigen::half)
+    CASE(int64)
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+  }
+  return Status::OK();
+}
+
+template <>
+Status DoCopy(const Device& d, const Tensor& x, Tensor* y) {
+  CHECK_EQ(x.dtype(), y->dtype());
+  switch (x.dtype()) {
+#define CASE(type)                              \
+  case DataTypeToEnum<type>::value:             \
+    y->flat<type>().device(d) = x.flat<type>(); \
+    break;
+
+    CASE(float)
+    CASE(double)
+    CASE(Eigen::half)
+    CASE(int64)
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported dtype: ", x.dtype());
+  }
+  return Status::OK();
+}
+
 }  // end namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 9e7786f25e052b0113a8020e3af1e015eae41b8d..d1e481d7ccf111c97583d0d1fe805efc813797c9 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -475,6 +475,22 @@ REGISTER_KERNEL_BUILDER(
 
 #endif  // GOOGLE_CUDA
 
+#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(T)               \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListPushBackBatch<CPUDevice, T>)
+
+TF_CALL_ALL_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint8);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint8);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint16);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint16);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint32);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
+
+#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU
+
 #define REGISTER_TENSOR_LIST_STACK_CPU(T)                         \
   REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
                               .TypeConstraint<T>("element_dtype") \
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index 935f892dd0515025e97e02c8e941b96f21ed3b3e..0ea9362cbe4da46d531086aef71618c3382a25e7 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -51,6 +51,21 @@ REGISTER_TENSOR_LIST_STACK_GPU(bool);
 
 #undef REGISTER_TENSOR_LIST_STACK_GPU
 
+#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(T)               \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU),                \
+                          TensorListPushBackBatch<GPUDevice, T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
+REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
+
+#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU
+
 #define REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(T)                   \
   REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
                               .TypeConstraint<T>("element_dtype") \
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index f3bbf3b6e37d0a2852b68a018e9d32ac88f610a7..42871c611301be2671a9c25e1e46abb0dc0a7b13 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -34,6 +34,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
 // Variant compatible type for a list of tensors. This is mutable but instances
 // should never be mutated after stored in a variant tensor.
 struct TensorList {
@@ -146,6 +148,10 @@ class TensorListFromTensor : public OpKernel {
     TensorList output_list;
     const Tensor& t = c->input(0);
     output_list.element_dtype = t.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(t.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    t.shape().DebugString()));
     TensorShape output_shape(t.shape());
     output_shape.RemoveDim(0);
     OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape),
@@ -267,6 +273,121 @@ Status TensorListZerosLike(OpKernelContext* c, const TensorList& x,
   return Status::OK();
 }
 
+template <typename Device, typename T>
+class TensorListPushBackBatch : public OpKernel {
+ public:
+  explicit TensorListPushBackBatch(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  ~TensorListPushBackBatch() override {}
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& input = c->input(1);
+    OP_REQUIRES(c, element_dtype_ == input.dtype(),
+                errors::InvalidArgument("Invalid data types; list elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but tried to append ",
+                                        DataTypeString(input.dtype())));
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+                errors::InvalidArgument(
+                    "Expected tensor to be at least a vector, but saw shape: ",
+                    input.shape().DebugString()));
+
+    const TensorShape& tls_shape = c->input(0).shape();
+
+    // For purposes of input forwarding, we want the least restrictive
+    // AllocatorAttributes possible.  If we need to allocate later,
+    // we'll request the DT_VARIANT be allocated on host.
+    AllocatorAttributes attr;
+
+    std::unique_ptr<Tensor> tls_alias = c->forward_input(
+        0 /*input_index*/, 0 /*output_index*/, DT_VARIANT, tls_shape,
+        DEVICE_MEMORY /* input is always on DEVICE_MEMORY */, attr);
+
+    const Tensor& tls = tls_alias ? *tls_alias : c->input(0);
+
+    OP_REQUIRES(c, tls.dtype() == DT_VARIANT,
+                errors::InvalidArgument(
+                    "Expected input_handles dtype to be Variant, but saw: ",
+                    DataTypeString(tls.dtype())));
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(tls_shape),
+                errors::InvalidArgument(
+                    "Expected input_handles to be a vector, but saw shape: ",
+                    tls_shape.DebugString()));
+    const int64 batch_size = tls.NumElements();
+    OP_REQUIRES(c, input.dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "Expected tensor.shape[0] == input_handles.size, but saw ",
+                    input.dim_size(0), " vs. ", batch_size));
+    auto tls_t = tls.vec<Variant>();
+
+    TensorShape input_element_shape = input.shape();
+    input_element_shape.RemoveDim(0);
+    std::vector<const TensorList*> tl_batch;
+    for (int64 b = 0; b < batch_size; ++b) {
+      const TensorList* l = tls_t(b).get<TensorList>();
+      OP_REQUIRES(c, l != nullptr,
+                  errors::InvalidArgument("Input handle at index ", b,
+                                          " is not a list. Saw: '",
+                                          tls_t(b).DebugString(), "'"));
+      OP_REQUIRES(
+          c, l->element_shape.IsCompatibleWith(input_element_shape),
+          errors::InvalidArgument(
+              "Tried to append a tensor with incompatible shape to a "
+              "list at index ",
+              b, ". Op element shape: ", input_element_shape.DebugString(),
+              " list shape: ", l->element_shape.DebugString()));
+      OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                  errors::InvalidArgument(
+                      "Invalid data type at index ", b, "; op elements ",
+                      DataTypeString(element_dtype_), " but list elements ",
+                      DataTypeString(l->element_dtype)));
+      tl_batch.push_back(l);
+    }
+
+    Tensor* result;
+
+    if (tls_alias) {
+      result = tls_alias.get();
+      c->set_output(0, *result);
+    } else {
+      // DT_VARIANT tensors always allocated on host.
+      AllocatorAttributes attr;
+      attr.set_on_host(true);
+      OP_REQUIRES_OK(
+          c, c->allocate_output(0, TensorShape{batch_size}, &result, attr));
+    }
+
+    if (batch_size == 0) {
+      return;
+    }
+
+    auto input_t = input.flat_outer_dims<T, 2>();
+    auto result_t = result->vec<Variant>();
+
+    for (int64 b = 0; b < batch_size; ++b) {
+      if (!tls_alias) {
+        result_t(b) = *tl_batch[b];
+      }
+      TensorList* output = result_t(b).get<TensorList>();
+      DCHECK(output != nullptr);
+      Tensor* frame;
+      PersistentTensor tmp;
+      OP_REQUIRES_OK(c, c->allocate_persistent(
+                            element_dtype_, input_element_shape, &tmp, &frame));
+      if (input_element_shape.num_elements() > 0) {
+        auto frame_t = frame->flat<T>();
+        frame_t.device(c->eigen_device<Device>()) = input_t.template chip<0>(b);
+      }
+      output->tensors.push_back(std::move(*frame));
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 27031d9216129b842195993279f6d6c2acf7fb5f..77386a16e01352a7691c744ee882c5c6e1b0d5d9 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -101,9 +101,10 @@ class TextFileLineIterator
     string line;
     status_ = input_buffer_->ReadLine(&line);
     if (!status_.ok()) {
-      if (errors::IsOutOfRange(status_) && next_id_ != total_size()) {
+      if (errors::IsOutOfRange(status_) && vocab_size_ != -1 &&
+          next_id_ != vocab_size_) {
         status_ = errors::InvalidArgument("Invalid vocab_size in ", filename_,
-                                          ": expected ", total_size(),
+                                          ": expected ", vocab_size_,
                                           " but got ", next_id_);
       }
       valid_ = false;
diff --git a/tensorflow/core/kernels/multinomial_op.cc b/tensorflow/core/kernels/multinomial_op.cc
index d086abb24760f1ab946605fd422a4fd0d5fc866d..7a647884486b52943c068efce6c5cccc73a97855 100644
--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/stateless_random_ops.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/util/guarded_philox_random.h"
@@ -127,18 +128,16 @@ struct MultinomialFunctor<CPUDevice, T, OutputType> {
 
 }  // namespace functor
 
+namespace {
+
 // Samples from a multinomial distribution.
 template <typename Device, typename T, typename OutputType>
 class MultinomialOp : public OpKernel {
  public:
-  explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, generator_.Init(context));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& logits_t = ctx->input(0);
-    const Tensor& num_samples_t = ctx->input(1);
+  explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {}
 
+  void DoCompute(OpKernelContext* ctx, const Tensor& logits_t,
+                 const Tensor& num_samples_t, GuardedPhiloxRandom* generator) {
     OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_t.shape()),
                 errors::InvalidArgument("logits should be a matrix, got shape ",
                                         logits_t.shape().DebugString()));
@@ -194,7 +193,7 @@ class MultinomialOp : public OpKernel {
       // CPU generates doubles = 2 samples per number.
       if (std::is_same<Device, CPUDevice>::value) num_samples_ceil_4 *= 2;
       auto rng =
-          generator_.ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
+          generator->ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
       functor::MultinomialFunctor<Device, T, OutputType>()(
           ctx, ctx->eigen_device<Device>(), logits_t.matrix<T>(),
           noises.flat<float>(), scores.flat<float>(), scratch.flat<float>(),
@@ -202,24 +201,38 @@ class MultinomialOp : public OpKernel {
           samples_t->matrix<OutputType>());
     }
   }
+};
+
+template <typename Device, typename T, typename OutputType>
+class StatefulMultinomialOp : public MultinomialOp<Device, T, OutputType> {
+ public:
+  explicit StatefulMultinomialOp(OpKernelConstruction* ctx)
+      : MultinomialOp<Device, T, OutputType>(ctx) {
+    OP_REQUIRES_OK(ctx, generator_.Init(ctx));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& logits_t = ctx->input(0);
+    const Tensor& num_samples_t = ctx->input(1);
+    this->DoCompute(ctx, logits_t, num_samples_t, &generator_);
+  }
 
  private:
   GuardedPhiloxRandom generator_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(MultinomialOp);
 };
 
-#define REGISTER(TYPE)                                                   \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
-                              .Device(DEVICE_CPU)                        \
-                              .TypeConstraint<TYPE>("T")                 \
-                              .TypeConstraint("output_dtype", DT_INT32), \
-                          MultinomialOp<CPUDevice, TYPE, int32>);        \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
-                              .Device(DEVICE_CPU)                        \
-                              .TypeConstraint<TYPE>("T")                 \
-                              .TypeConstraint("output_dtype", DT_INT64), \
-                          MultinomialOp<CPUDevice, TYPE, int64>);
+// TODO(b/77906027): Add a TPU implementation.
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                             \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT32),  \
+                          StatefulMultinomialOp<CPUDevice, TYPE, int32>); \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                             \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT64),  \
+                          StatefulMultinomialOp<CPUDevice, TYPE, int64>);
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -233,13 +246,83 @@ TF_CALL_double(REGISTER);
                               .HostMemory("num_samples")                 \
                               .TypeConstraint<TYPE>("T")                 \
                               .TypeConstraint("output_dtype", DT_INT32), \
-                          MultinomialOp<GPUDevice, TYPE, int32>)         \
+                          StatefulMultinomialOp<GPUDevice, TYPE, int32>) \
   REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
                               .Device(DEVICE_GPU)                        \
                               .HostMemory("num_samples")                 \
                               .TypeConstraint<TYPE>("T")                 \
                               .TypeConstraint("output_dtype", DT_INT64), \
-                          MultinomialOp<GPUDevice, TYPE, int64>)
+                          StatefulMultinomialOp<GPUDevice, TYPE, int64>)
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+
+#endif  // GOOGLE_CUDA
+
+template <typename Device, typename T, typename OutputType>
+class StatelessMultinomialOp : public MultinomialOp<Device, T, OutputType> {
+ public:
+  explicit StatelessMultinomialOp(OpKernelConstruction* ctx)
+      : MultinomialOp<Device, T, OutputType>(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& logits_t = ctx->input(0);
+    const Tensor& num_samples_t = ctx->input(1);
+
+    const Tensor& seed_t = ctx->input(2);
+    OP_REQUIRES(ctx, seed_t.dims() == 1 && seed_t.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_t.shape().DebugString()));
+
+    random::PhiloxRandom::Key key;
+    random::PhiloxRandom::ResultType counter;
+    OP_REQUIRES_OK(ctx, GenerateKey(seed_t, &key, &counter));
+
+    GuardedPhiloxRandom generator;
+    generator.Init(counter, key);
+
+    this->DoCompute(ctx, logits_t, num_samples_t, &generator);
+  }
+
+ private:
+  GuardedPhiloxRandom generator_;
+};
+
+#define REGISTER(TYPE)                                                     \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                     \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<TYPE>("T")                   \
+                              .TypeConstraint("output_dtype", DT_INT32),   \
+                          StatelessMultinomialOp<CPUDevice, TYPE, int32>); \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                     \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<TYPE>("T")                   \
+                              .TypeConstraint("output_dtype", DT_INT64),   \
+                          StatelessMultinomialOp<CPUDevice, TYPE, int64>);
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+
+#if GOOGLE_CUDA
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                    \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("num_samples")                  \
+                              .HostMemory("seed")                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT32),  \
+                          StatelessMultinomialOp<GPUDevice, TYPE, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                    \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("num_samples")                  \
+                              .HostMemory("seed")                         \
+                              .TypeConstraint<TYPE>("T")                  \
+                              .TypeConstraint("output_dtype", DT_INT64),  \
+                          StatelessMultinomialOp<GPUDevice, TYPE, int64>)
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -248,4 +331,6 @@ TF_CALL_double(REGISTER);
 
 #endif  // GOOGLE_CUDA
 
+}  // end namespace
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index f49a05c70ad122ce5da17ef91f279255ad18e306..916869fb566f11495bc2c6e86ff9213913347b2e 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -279,64 +279,6 @@ class AssignVariableOp : public OpKernel {
   DataType dtype_;
 };
 
-template <typename Device>
-Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
-
-#define CPU_DENSE_COPY(T)                                                \
-  case DataTypeToEnum<T>::value: {                                       \
-    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
-    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
-                  from.flat<T>());                                       \
-    break;                                                               \
-  }
-
-#define INSTANTIATE_GET_VARIANT_COPY_FN(Device, TYPE_CALLER, TYPE_DENSE_COPY) \
-  template <>                                                                 \
-  Status VariantCopyFn<Device>(OpKernelContext * context, const Tensor& from, \
-                               Tensor* to) {                                  \
-    PersistentTensor tmp;                                                     \
-    Tensor* tensor;                                                           \
-    AllocatorAttributes attr;                                                 \
-    attr.set_gpu_compatible(true);                                            \
-    attr.set_nic_compatible(true);                                            \
-    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
-        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
-    switch (from.dtype()) {                                                   \
-      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
-      default:                                                                \
-        return errors::InvalidArgument(                                       \
-            "VariantCopyFn: Could not perform a deep copy of variant "        \
-            "element of type: ",                                              \
-            DataTypeString(from.dtype()),                                     \
-            " using device: ", context->device()->name());                    \
-    }                                                                         \
-    *to = *tensor;                                                            \
-    return Status::OK();                                                      \
-  }
-
-INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
-
-#if GOOGLE_CUDA
-#define GPU_DENSE_COPY(T)                                                \
-  case DataTypeToEnum<T>::value: {                                       \
-    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
-    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
-                  from.flat<T>());                                       \
-    break;                                                               \
-  }
-#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
-  TF_CALL_GPU_ALL_TYPES(T);                 \
-  TF_CALL_int32(T);                         \
-  TF_CALL_int64(T);
-INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
-                                GPU_DENSE_COPY);
-#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
-#undef GPU_DENSE_COPY
-#endif  // GOOGLE_CUDA
-
-#undef CPU_DENSE_COPY
-#undef INSTANTIATE_GET_VARIANT_COPY_FN
-
 template <typename Device>
 class AssignVariableOp<Device, Variant> : public OpKernel {
  public:
@@ -364,15 +306,23 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(DT_VARIANT)));
 
+    // For purposes of forwarding DT_VARIANT, we want the least
+    // restrictive attr; we already know the input is on host.
     AllocatorAttributes attr;
-    attr.set_on_host(true);
 
     // Copying is unnecessary if we are the last user of the value
     // tensor, we can just adopt the input tensor's buffer instead.
     // Note that Variant objects themselves always reside on host.
+    //
+    // We nevertheless want to signal to the runtime that the tensor
+    // should reside in memory of the associated device, as Variant
+    // tensors may be marked as sitting on either CPU or GPU.  This
+    // helps to elide one or more copies.
     std::unique_ptr<Tensor> input_alias = context->forward_input(
         1, OpKernelContext::Params::kNoReservation /*output_index*/, DT_VARIANT,
-        value.shape(), HOST_MEMORY, attr);
+        value.shape(),
+        DEVICE_MEMORY /* HOST_MEMORY is only reserved for special cases */,
+        attr);
 
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
@@ -388,6 +338,8 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
         !variable->tensor()->shape().IsSameSize(value.shape())) {
       PersistentTensor unused;
       Tensor* tmp;
+      // Allocation of DT_VARIANT is always on host.
+      attr.set_on_host(true);
       OP_REQUIRES_OK(context,
                      context->allocate_persistent(DT_VARIANT, value.shape(),
                                                   &unused, &tmp, attr));
@@ -396,12 +348,8 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
 
     const auto elements_in = value.flat<Variant>();
     auto elements_out = variable->tensor()->flat<Variant>();
-    auto copy_fn = std::bind(&VariantCopyFn<Device>, context,
-                             std::placeholders::_1, std::placeholders::_2);
     for (int64 i = 0; i < elements_in.size(); ++i) {
-      OP_REQUIRES_OK(context, VariantDeviceCopy(
-                                  VariantDeviceCopyDirection::DEVICE_TO_DEVICE,
-                                  elements_in(i), &elements_out(i), copy_fn));
+      elements_out(i) = elements_in(i);
     }
   }
 
@@ -560,7 +508,14 @@ class ResourceGatherOp : public OpKernel {
     }
 
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
+    Tensor tmp;
+    if (params.dtype() == DT_VARIANT) {
+      tmp = Tensor(DT_VARIANT, result_shape);
+      c->set_output(0, tmp);
+      out = &tmp;
+    } else {
+      OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
+    }
     if (N > 0) {
       const int64 gather_dim_size = params.dim_size(0);
       int64 inner_size = 1;
@@ -607,6 +562,23 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
 
+// Variant objects themselves sit on CPU, even if they contain data
+// pointing to a device.
+REGISTER_KERNEL_BUILDER(Name("ResourceGather")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceGatherOp<GPUDevice, Variant, int32>)
+REGISTER_KERNEL_BUILDER(Name("ResourceGather")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int64>("Tindices"),
+                        ResourceGatherOp<GPUDevice, Variant, int64>)
+
 #endif  // GOOGLE_CUDA
 
 #undef REGISTER_GATHER_CPU
@@ -721,6 +693,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_CPU);
 
 REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
                         scatter_op::UpdateOp::ASSIGN);
+REGISTER_SCATTER_KERNEL(Variant, CPU, "ResourceScatterUpdate",
+                        scatter_op::UpdateOp::ASSIGN);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
@@ -733,6 +707,23 @@ REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
 
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, Variant, int32,
+                                                scatter_op::UpdateOp::ASSIGN>)
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int64>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, Variant, int64,
+                                                scatter_op::UpdateOp::ASSIGN>)
+
 #endif  // GOOGLE_CUDA
 
 #undef REGISTER_SCATTER_ARITHMETIC
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index bcbdbee058b4fdb587f2099c54545b8a6aec8ca9..4b630809c5a85496dc57476c6291729f54abc5a7 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -254,8 +254,11 @@ class RollOp : public OpKernel {
     // total modulo sum of shifts for each dimension
     gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
     for (int i = 0; i < num_shifts; i++) {
-      const int axis = axis_flat(i);
-      OP_REQUIRES(context, axis < num_dims,
+      int axis = axis_flat(i);
+      if (axis < 0) {
+        axis += num_dims;
+      }
+      OP_REQUIRES(context, 0 <= axis && axis < num_dims,
                   errors::InvalidArgument("axis ", axis, " is out of range"));
       const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
       const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
diff --git a/tensorflow/core/kernels/rpc_op.cc b/tensorflow/core/kernels/rpc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2447ef504044e2289a99d19630112d33b0147a8a
--- /dev/null
+++ b/tensorflow/core/kernels/rpc_op.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// RpcOp is a TensorFlow op that sends and receives arbitrary messages.
+//
+// See docs in ../ops/rpc_op.cc.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+
+class RpcOp : public AsyncOpKernel {
+ public:
+  explicit RpcOp(OpKernelConstruction* context) : AsyncOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("protocol", &protocol_));
+    OP_REQUIRES(context, !protocol_.empty(),
+                errors::InvalidArgument("protocol must be non-empty."));
+    bool fail_fast;
+    OP_REQUIRES_OK(context, context->GetAttr("fail_fast", &fail_fast));
+    int64 timeout_in_ms;
+    OP_REQUIRES_OK(context, context->GetAttr("timeout_in_ms", &timeout_in_ms));
+
+    RPCFactoryRegistry::RPCFactoryFn* rpc_factory_fn =
+        RPCFactoryRegistry::Global()->Get(protocol_);
+    OP_REQUIRES(context, rpc_factory_fn != nullptr,
+                errors::InvalidArgument("The protocol ", protocol_,
+                                        " was not recognized."));
+
+    rpc_factory_.reset((*rpc_factory_fn)(context, fail_fast, timeout_in_ms));
+  }
+
+  ~RpcOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    const Tensor& address_t = ctx->input(0);
+    const Tensor& method_t = ctx->input(1);
+    const Tensor& request_t = ctx->input(2);
+
+    OP_REQUIRES_ASYNC(
+        ctx, address_t.dims() == 0 || address_t.dims() == 1,
+        errors::InvalidArgument("address must be a scalar or vector."), done);
+    OP_REQUIRES_ASYNC(
+        ctx, method_t.dims() == 0 || method_t.dims() == 1,
+        errors::InvalidArgument("method must be a scalar or vector."), done);
+    OP_REQUIRES_ASYNC(
+        ctx, request_t.dims() == 0 || request_t.dims() == 1,
+        errors::InvalidArgument("request must be a scalar or vector."), done);
+
+    TensorShape output_shape({});
+    for (const Tensor& t : {address_t, method_t, request_t}) {
+      if (t.dims() == 1) {
+        OP_REQUIRES_ASYNC(
+            ctx,
+            output_shape.dims() == 0 ||
+                output_shape.dim_size(0) == t.dim_size(0),
+            errors::InvalidArgument(
+                "Input vector shapes don't match: ", output_shape.DebugString(),
+                " vs. ", t.shape().DebugString()),
+            done);
+        output_shape = t.shape();
+      }
+    }
+
+    Tensor* response_t;
+    OP_REQUIRES_OK_ASYNC(
+        ctx, ctx->allocate_output(0, output_shape, &response_t), done);
+
+    const bool try_rpc = (ctx->num_outputs() > 1);
+
+    Tensor* status_code_t = nullptr;
+    Tensor* status_message_t = nullptr;
+    if (try_rpc) {
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ctx->allocate_output(1, output_shape, &status_code_t), done);
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ctx->allocate_output(2, output_shape, &status_message_t), done);
+    }
+
+    if (request_t.NumElements() == 0) {
+      // Special case, we finished early!
+      done();
+      return;
+    }
+
+    int64 num_elements = output_shape.num_elements();
+
+    rpc_factory_->Call(ctx, num_elements, address_t, method_t, request_t,
+                       try_rpc, response_t, status_code_t, status_message_t,
+                       std::move(done));
+  }
+
+ private:
+  string protocol_;
+  std::unique_ptr<RPCFactory> rpc_factory_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RpcOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Rpc").Device(DEVICE_CPU), RpcOp);
+REGISTER_KERNEL_BUILDER(Name("TryRpc").Device(DEVICE_CPU), RpcOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index 52666645bf0bb38df3fd600c602313d7b5925b00..ebaa2bd9c6253abf975c74338125529282dd7850 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -20,8 +20,11 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -203,9 +206,9 @@ struct ScatterFunctorBase {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -216,6 +219,42 @@ struct ScatterFunctorBase {
   }
 };
 
+template <typename Device, typename Index>
+struct ScatterFunctorVariantAssignBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<Variant>::Matrix params,
+                   typename TTypes<Variant>::ConstMatrix updates,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    const Index cols = static_cast<Index>(params.dimension(1));
+    DCHECK_EQ(N, updates.dimension(0));
+    DCHECK_EQ(cols, updates.dimension(1));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Copy last Ndim-1 dimensions of updates[i] to params[index]
+      for (int j = 0; j < cols; ++j) {
+        const Variant& to_scatter = updates(i, j);
+        params(index, j) = to_scatter;
+      }
+    }
+    return -1;
+  }
+};
+
+template <typename Index>
+struct ScatterFunctor<CPUDevice, Variant, Index, scatter_op::UpdateOp::ASSIGN>
+    : ScatterFunctorVariantAssignBase<CPUDevice, Index> {};
+
+template <typename Index>
+struct ScatterFunctor<GPUDevice, Variant, Index, scatter_op::UpdateOp::ASSIGN>
+    : ScatterFunctorVariantAssignBase<GPUDevice, Index> {};
+
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
@@ -227,9 +266,9 @@ struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -252,9 +291,10 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
     const Index limit = static_cast<Index>(params.dimension(0));
     if (!std::is_same<T, string>::value) {
       for (Index i = 0; i < N; i++) {
-        // Grab the index and check its validity.  An earlier version of the
-        // code checked it and then grabbed it from memory a second time, which
-        // was a security risk since it could have changed in between.
+        // Grab the index and check its validity.  Do this carefully,
+        // to avoid checking the value and grabbing it again from
+        // memory a second time (a security risk since it may change in
+        // between).
         const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
         if (!FastBoundsCheck(index, limit)) return i;
         memmove(params.data() + index * params.dimension(1),
@@ -263,9 +303,10 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
       }
     } else {
       for (Index i = 0; i < N; i++) {
-        // Grab the index and check its validity.  An earlier version of the
-        // code checked it and then grabbed it from memory a second time, which
-        // was a security risk since it could have changed in between.
+        // Grab the index and check its validity.  Do this carefully,
+        // to avoid checking the value and grabbing it again from
+        // memory a second time (a security risk since it may change in
+        // between).
         const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
         if (!FastBoundsCheck(index, limit)) return i;
         // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -321,9 +362,9 @@ struct ScatterScalarFunctorBase {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
@@ -334,6 +375,41 @@ struct ScatterScalarFunctorBase {
   }
 };
 
+template <typename Device, typename Index>
+struct ScatterScalarFunctorVariantAssignBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<Variant>::Matrix params,
+                   const typename TTypes<Variant>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    const Index cols = static_cast<Index>(params.dimension(1));
+    const Variant& to_scatter = update();
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      for (Index j = 0; j < cols; ++j) {
+        params(index, j) = to_scatter;
+      }
+    }
+    return -1;
+  }
+};
+
+template <typename Index>
+struct ScatterScalarFunctor<CPUDevice, Variant, Index,
+                            scatter_op::UpdateOp::ASSIGN>
+    : ScatterScalarFunctorVariantAssignBase<CPUDevice, Index> {};
+template <typename Index>
+struct ScatterScalarFunctor<GPUDevice, Variant, Index,
+                            scatter_op::UpdateOp::ASSIGN>
+    : ScatterScalarFunctorVariantAssignBase<GPUDevice, Index> {};
+
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterScalarFunctorBase<SYCLDevice, T, Index, op> {
@@ -345,9 +421,9 @@ struct ScatterScalarFunctorBase<SYCLDevice, T, Index, op> {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
@@ -370,9 +446,9 @@ struct ScatterScalarFunctorBase<CPUDevice, T, Index,
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 89abfe0eb1b49c4dae5e69803dc3b9e1cb6ba5ad..6d35ff2de6df5f0d3c6a9d06e4528cd355742eb7 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -17,6 +17,14 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
 
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
+
 // This file requires the following include because it uses CudaAtomicMax:
 // #include "tensorflow/core/util/cuda_kernel_helper.h"
 
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 03fc4467a1dcf9d70c90c19809690934b0a7c2f4..73a02a34cf231799e6a813f042757d70b4e9414a 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -32,53 +32,8 @@ namespace {
 
 class Buffer : public ResourceBase {
  public:
-  // public types
   using Tuple = std::vector<Tensor>;
 
- private:
-  // private variables
-  std::size_t capacity_;
-  std::size_t memory_limit_;
-  std::size_t current_bytes_;
-  std::mutex mu_;
-  std::condition_variable non_empty_cond_var_;
-  std::condition_variable full_cond_var_;
-  std::deque<Tuple> buf_;
-
- private:
-  // private methods
-
-  // If the buffer is configured for bounded capacity, notify
-  // waiting inserters that space is now available
-  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
-    if (IsBounded()) {
-      lock->unlock();
-      // Notify all inserters. The removal of an element
-      // may make memory available for many inserters
-      // to insert new elements
-      full_cond_var_.notify_all();
-    }
-  }
-
-  // Are there a limit number of elements or a memory limit
-  // configued on this buffer?
-  bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
-
-  bool IsCapacityFull() const { return buf_.size() >= capacity_; }
-
-  bool WouldExceedMemoryLimit(std::size_t bytes) const {
-    return bytes + current_bytes_ > memory_limit_;
-  }
-
-  std::size_t GetTupleBytes(const Tuple& tuple) {
-    return std::accumulate(tuple.begin(), tuple.end(), 0,
-                           [](const std::size_t& lhs, const Tensor& rhs) {
-                             return lhs + rhs.TotalBytes();
-                           });
-  }
-
- public:
-  // public methods
   explicit Buffer(std::size_t capacity, std::size_t memory_limit)
       : capacity_(capacity), memory_limit_(memory_limit), current_bytes_(0) {}
 
@@ -181,6 +136,44 @@ class Buffer : public ResourceBase {
     std::unique_lock<std::mutex> lock(mu_);
     return strings::StrCat("Staging size: ", buf_.size());
   }
+
+ private:
+  // If the buffer is configured for bounded capacity, notify
+  // waiting inserters that space is now available
+  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
+    if (IsBounded()) {
+      lock->unlock();
+      // Notify all inserters. The removal of an element
+      // may make memory available for many inserters
+      // to insert new elements
+      full_cond_var_.notify_all();
+    }
+  }
+
+  // Are there a limit number of elements or a memory limit
+  // configued on this buffer?
+  bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
+
+  bool IsCapacityFull() const { return buf_.size() >= capacity_; }
+
+  bool WouldExceedMemoryLimit(std::size_t bytes) const {
+    return bytes + current_bytes_ > memory_limit_;
+  }
+
+  std::size_t GetTupleBytes(const Tuple& tuple) {
+    return std::accumulate(tuple.begin(), tuple.end(), 0,
+                           [](const std::size_t& lhs, const Tensor& rhs) {
+                             return lhs + rhs.TotalBytes();
+                           });
+  }
+
+  std::size_t capacity_;
+  std::size_t memory_limit_;
+  std::size_t current_bytes_;
+  std::mutex mu_;
+  std::condition_variable non_empty_cond_var_;
+  std::condition_variable full_cond_var_;
+  std::deque<Tuple> buf_;
 };
 
 Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index 88fcf542fb0cc726b228be34d0fe7b92663ce95d..eab176c7fb78c1a2f0a48b907c3b01bc640758d0 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -27,6 +27,41 @@ namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
 
+Status GenerateKey(Tensor seed, random::PhiloxRandom::Key* out_key,
+                   random::PhiloxRandom::ResultType* out_counter) {
+  // Grab the two seeds
+  uint64 seed0;
+  uint64 seed1;
+  if (seed.dtype() == DT_INT32) {
+    const auto seed_vals = seed.flat<int32>();
+    seed0 = internal::SubtleMustCopy(seed_vals(0));
+    seed1 = internal::SubtleMustCopy(seed_vals(1));
+  } else if (seed.dtype() == DT_INT64) {
+    const auto seed_vals = seed.flat<int64>();
+    seed0 = internal::SubtleMustCopy(seed_vals(0));
+    seed1 = internal::SubtleMustCopy(seed_vals(1));
+  } else {
+    return errors::InvalidArgument("Invalid seed type: ",
+                                   DataTypeString(seed.dtype()));
+  }
+
+  // Scramble the seeds so that the user doesn't need to worry about which
+  // part of the seed needs to be strong.
+  (*out_key)[0] = 0x3ec8f720;
+  (*out_key)[1] = 0x02461e29;
+  (*out_counter)[0] = static_cast<uint32>(seed0);
+  (*out_counter)[1] = static_cast<uint32>(seed0 >> 32);
+  (*out_counter)[2] = static_cast<uint32>(seed1);
+  (*out_counter)[3] = static_cast<uint32>(seed1 >> 32);
+  const auto mix = random::PhiloxRandom(*out_counter, *out_key)();
+  (*out_key)[0] = mix[0];
+  (*out_key)[1] = mix[1];
+  (*out_counter)[0] = (*out_counter)[1] = 0;
+  (*out_counter)[2] = mix[2];
+  (*out_counter)[3] = mix[3];
+  return Status::OK();
+}
+
 namespace {
 
 class StatelessRandomOpBase : public OpKernel {
@@ -49,36 +84,9 @@ class StatelessRandomOpBase : public OpKernel {
     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output));
     if (shape.num_elements() == 0) return;
 
-    // Grab the two seeds
-    uint64 seed0;
-    uint64 seed1;
-    if (context->input_dtype(1) == DT_INT32) {
-      const auto seed = seed_t.flat<int32>();
-      seed0 = internal::SubtleMustCopy(seed(0));
-      seed1 = internal::SubtleMustCopy(seed(1));
-    } else {
-      CHECK_EQ(DT_INT64, context->input_dtype(1));
-      const auto seed = seed_t.flat<int64>();
-      seed0 = internal::SubtleMustCopy(seed(0));
-      seed1 = internal::SubtleMustCopy(seed(1));
-    }
-
-    // Scramble the seeds so that the user doesn't need to worry about which
-    // part of the seed needs to be strong.
     random::PhiloxRandom::Key key;
     random::PhiloxRandom::ResultType counter;
-    key[0] = 0x3ec8f720;
-    key[1] = 0x02461e29;
-    counter[0] = static_cast<uint32>(seed0);
-    counter[1] = static_cast<uint32>(seed0 >> 32);
-    counter[2] = static_cast<uint32>(seed1);
-    counter[3] = static_cast<uint32>(seed1 >> 32);
-    const auto mix = random::PhiloxRandom(counter, key)();
-    key[0] = mix[0];
-    key[1] = mix[1];
-    counter[0] = counter[1] = 0;
-    counter[2] = mix[2];
-    counter[3] = mix[3];
+    OP_REQUIRES_OK(context, GenerateKey(seed_t, &key, &counter));
 
     // Fill in the random numbers
     Fill(context, random::PhiloxRandom(counter, key), output);
@@ -105,8 +113,6 @@ class StatelessRandomOp : public StatelessRandomOpBase {
   }
 };
 
-}  // namespace
-
 #define REGISTER(TYPE)                                                 \
   REGISTER_KERNEL_BUILDER(                                             \
       Name("StatelessRandomUniform")                                   \
@@ -176,4 +182,6 @@ TF_CALL_double(REGISTER);
 
 #endif  // GOOGLE_CUDA
 
+}  // namespace
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stateless_random_ops.h b/tensorflow/core/kernels/stateless_random_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcd29c487397726627fd23b7538d46f1f89a573f
--- /dev/null
+++ b/tensorflow/core/kernels/stateless_random_ops.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+// Generates a key and counter that can be used to seed a PhiloxRandom,
+// generator, based on the seed value in `seed_t`.
+//
+// REQUIRES: `seed_t` must be a length-2 vector of type DT_INT{32,64}.
+// `out_key` and `out_counter` must be non-null.
+Status GenerateKey(Tensor seed_t, random::PhiloxRandom::Key* out_key,
+                   random::PhiloxRandom::ResultType* out_counter);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STATELESS_RANDOM_OPS_H_
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index f6e2a5ae251f57eed99b5c968d806310be11440e..7e56e15450aba23e6625b27da34a29b1ad2ecce2 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
@@ -40,14 +41,27 @@ Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor) {
     // updating.
     PersistentTensor unused;
     Tensor* tmp;
-    AllocatorAttributes attr;
-    attr.set_gpu_compatible(true);
-    attr.set_nic_compatible(true);
-    TF_RETURN_IF_ERROR(ctx->allocate_persistent(
-        tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
-    functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
-    copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
-                 const_cast<const Tensor*>(tensor)->flat<T>());
+    if (std::is_same<T, Variant>::value) {
+      AllocatorAttributes attr;
+      attr.set_on_host(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+          tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
+
+      const auto elements_in = tensor->flat<Variant>();
+      auto elements_out = tmp->flat<Variant>();
+      for (int64 i = 0; i < elements_in.size(); ++i) {
+        elements_out(i) = elements_in(i);
+      }
+    } else {
+      AllocatorAttributes attr;
+      attr.set_gpu_compatible(true);
+      attr.set_nic_compatible(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+          tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
+      functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
+      copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
+                   const_cast<const Tensor*>(tensor)->flat<T>());
+    }
     *tensor = *tmp;
   }
   return Status::OK();
@@ -64,24 +78,21 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
                                   bool lock_held, bool sparse, Tensor* out) {
   if (ctx->input_dtype(input) == DT_RESOURCE) {
     Var* var;
-    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
-      core::ScopedUnref unref_var(var);
-      if (lock_held) {
+    TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var));
+    core::ScopedUnref unref_var(var);
+    if (lock_held) {
+      TF_RETURN_IF_ERROR(
+          PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
+      *out = *var->tensor();
+    } else {
+      mutex_lock ml(*var->mu());
+      if (!sparse) {
         TF_RETURN_IF_ERROR(
             PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-        *out = *var->tensor();
-      } else {
-        mutex_lock ml(*var->mu());
-        if (!sparse) {
-          TF_RETURN_IF_ERROR(
-              PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-        }
-        *out = *var->tensor();
       }
-      return Status::OK();
-    } else {
-      return errors::Internal("Invalid variable reference.");
+      *out = *var->tensor();
     }
+    return Status::OK();
   }
   *out = ctx->mutable_input(input, lock_held);
   return Status::OK();
diff --git a/tensorflow/core/lib/gtl/flatmap_test.cc b/tensorflow/core/lib/gtl/flatmap_test.cc
index bb65e5357a845ebc132a8518fc28fec94b669bde..0901eba9265a48351d108a73a620dd753f4ec92f 100644
--- a/tensorflow/core/lib/gtl/flatmap_test.cc
+++ b/tensorflow/core/lib/gtl/flatmap_test.cc
@@ -321,7 +321,7 @@ TEST(FlatMap, Copy) {
     NumMap copy2;
     copy2 = src;
     EXPECT_EQ(Contents(src), Contents(copy2));
-    copy2 = copy2;  // Self-assignment
+    copy2 = *&copy2;  // Self-assignment, avoiding -Wself-assign.
     EXPECT_EQ(Contents(src), Contents(copy2));
   }
 }
diff --git a/tensorflow/core/lib/gtl/flatset_test.cc b/tensorflow/core/lib/gtl/flatset_test.cc
index 09fbbb1fb6c6670d24345c0043c56df0ed2c7bb0..010b4bb5df3337ad814caa3a8767796074be1d18 100644
--- a/tensorflow/core/lib/gtl/flatset_test.cc
+++ b/tensorflow/core/lib/gtl/flatset_test.cc
@@ -252,7 +252,7 @@ TEST(FlatSet, Copy) {
     NumSet copy2;
     copy2 = src;
     EXPECT_EQ(Contents(src), Contents(copy2));
-    copy2 = copy2;  // Self-assignment
+    copy2 = *&copy2;  // Self-assignment, avoiding -Wself-assign.
     EXPECT_EQ(Contents(src), Contents(copy2));
   }
 }
diff --git a/tensorflow/core/lib/gtl/manual_constructor.h b/tensorflow/core/lib/gtl/manual_constructor.h
index 0a76e0962e6b0939763dfddaeeb95e399670571a..0176cdc94d8b543d33ac27a651ef2ac9da241719 100644
--- a/tensorflow/core/lib/gtl/manual_constructor.h
+++ b/tensorflow/core/lib/gtl/manual_constructor.h
@@ -53,7 +53,7 @@ template <int size>
 struct AlignType<0, size> {
   typedef char result[size];
 };
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
 #define TF_LIB_GTL_ALIGN_ATTRIBUTE(X) __declspec(align(X))
 #define TF_LIB_GTL_ALIGN_OF(T) __alignof(T)
 #elif defined(COMPILER_GCC3) || __GNUC__ >= 3 || defined(__APPLE__) || \
diff --git a/tensorflow/core/lib/strings/stringprintf.cc b/tensorflow/core/lib/strings/stringprintf.cc
index 03eba4c851f8a9c7c65ddf9396d87800e0ac57df..bbffa062a93e554d9258ad29316c5e8ee2cf428e 100644
--- a/tensorflow/core/lib/strings/stringprintf.cc
+++ b/tensorflow/core/lib/strings/stringprintf.cc
@@ -22,12 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace strings {
 
-#ifdef COMPILER_MSVC
-enum { IS_COMPILER_MSVC = 1 };
-#else
-enum { IS_COMPILER_MSVC = 0 };
-#endif
-
 void Appendv(string* dst, const char* format, va_list ap) {
   // First try with a small fixed size buffer
   static const int kSpaceLength = 1024;
@@ -48,13 +42,13 @@ void Appendv(string* dst, const char* format, va_list ap) {
       return;
     }
 
-    if (IS_COMPILER_MSVC) {
+#ifdef _MSC_VER
       // Error or MSVC running out of space.  MSVC 8.0 and higher
       // can be asked about space needed with the special idiom below:
       va_copy(backup_ap, ap);
       result = vsnprintf(nullptr, 0, format, backup_ap);
       va_end(backup_ap);
-    }
+#endif
 
     if (result < 0) {
       // Just an error.
diff --git a/tensorflow/core/lib/strings/stringprintf_test.cc b/tensorflow/core/lib/strings/stringprintf_test.cc
index d61a1a945ae0b6cab5e472b41402772f04875077..02cf4cbcadc9f64cc2249f6d1b104c1a6fab103f 100644
--- a/tensorflow/core/lib/strings/stringprintf_test.cc
+++ b/tensorflow/core/lib/strings/stringprintf_test.cc
@@ -30,9 +30,9 @@ TEST(PrintfTest, Empty) {
 
 TEST(PrintfTest, Misc) {
 // MSVC does not support $ format specifier.
-#if !defined(COMPILER_MSVC)
+#if !defined(_MSC_VER)
   EXPECT_EQ("123hello w", Printf("%3$d%2$s %1$c", 'w', "hello", 123));
-#endif  // !COMPILER_MSVC
+#endif  // !_MSC_VER
 }
 
 TEST(AppendfTest, Empty) {
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 62ce70eb6b4bc9621f49b06d0eead85b6c76d17c..88fc03826a8dcc99c875e186dacbc31456d6fbcf 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -27,6 +27,7 @@ namespace tensorflow {
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
+using shape_inference::UnchangedShape;
 
 namespace {
 
@@ -341,6 +342,50 @@ REGISTER_OP("Pack")
       return Status::OK();
     });
 
+REGISTER_OP("DeepCopy")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetIsStateful()
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceUpdate")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceAdd")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceSub")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("Empty")
+    .Input("shape: int32")
+    .Output("output: dtype")
+    .Attr("dtype: type")
+    .Attr("init: bool = false")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 REGISTER_OP("Unpack")
     .Input("value: T")
@@ -384,6 +429,58 @@ REGISTER_OP("UnravelIndex")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
 
+REGISTER_OP("BroadcastTo")
+    .Input("input: T")
+    .Input("shape: Tidx")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle in = c->input(0);
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+
+      if (!c->RankKnown(out)) {
+        // We have no information about the shape of the output.
+        c->set_output(0, out);
+        return Status::OK();
+      }
+
+      if (!c->RankKnown(in)) {
+        // We have no information about the shape of the input,
+        // nothing to do here.
+        c->set_output(0, out);
+        return Status::OK();
+      }
+      if (c->Rank(out) < c->Rank(in)) {
+        return errors::InvalidArgument("Cannot broadcast a tensor with shape ",
+                                       c->DebugString(in), " shape ",
+                                       c->DebugString(out));
+      }
+
+      int32 in_offset = c->Rank(out) - c->Rank(in);
+      for (int32 i = 0; i < c->Rank(out); ++i) {
+        DimensionHandle dim = c->Dim(out, i);
+        if (c->ValueKnown(dim)) {
+          // The first in_offset dimensions for input will be expanded with 1,
+          // so no check needed.
+          if (i >= in_offset) {
+            DimensionHandle in_dim = c->Dim(in, i - in_offset);
+            if (c->ValueKnown(in_dim)) {
+              if (c->Value(dim) % c->Value(in_dim) != 0) {
+                return errors::InvalidArgument(
+                    "Cannot broadcast a tensor with shape ", c->DebugString(in),
+                    " shape ", c->DebugString(out));
+              }
+            }
+          }
+        }
+      }
+
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
 // in the N == 1 case to remove the node.
@@ -622,7 +719,7 @@ REGISTER_OP("OnesLike")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {bfloat16, float, double, int8, uint8, int16, uint16, int32, "
+        "T: {bfloat16, half, float, double, int8, uint8, int16, uint16, int32, "
         "int64, complex64, complex128, bool}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
@@ -630,7 +727,9 @@ REGISTER_OP("OnesLike")
 REGISTER_OP("Diag")
     .Input("diagonal: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(in, 1, &in));
@@ -645,7 +744,9 @@ REGISTER_OP("Diag")
 REGISTER_OP("DiagPart")
     .Input("input: T")
     .Output("diagonal: T")
-    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
@@ -789,7 +890,7 @@ REGISTER_OP("ReverseV2")
     .Output("output: T")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, bfloat16, "
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, bfloat16, half, "
         "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
@@ -1165,7 +1266,7 @@ REGISTER_OP("PreventGradient")
 REGISTER_OP("CheckNumerics")
     .Input("tensor: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .Attr("message: string")
     .SetShapeFn(shape_inference::UnchangedShape);
 
@@ -2450,13 +2551,12 @@ REGISTER_OP("Bitcast")
     .Output("output: type")
     // All supported dtypes are listed here to include qint16 and quint16.
     .Attr(
-        "T: {bfloat16, float, double, int64, int32, uint8, uint16, int8, int16,"
-        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
-        " half}")
+        "T: {bfloat16, half, float, double, int64, int32, uint8, uint16, int8, "
+        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32}")
     .Attr(
-        "type: {bfloat16, float, double, int64, int32, uint8, uint16, int8, "
-        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
-        " half}")
+        "type: {bfloat16, half, float, double, int64, int32, uint8, uint16, "
+        "int8, int16, complex64, complex128, qint8, quint8, qint16, quint16, "
+        "qint32}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       if (!c->RankKnown(input)) {
@@ -2552,7 +2652,7 @@ REGISTER_OP("QuantizeAndDequantize")
     .Attr("input_min: float = 0")
     .Attr("input_max: float = 0")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Deprecated(22, "Replaced by QuantizeAndDequantizeV2");
 
@@ -2565,7 +2665,7 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("num_bits: int = 8")
     .Attr("range_given: bool = false")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -2582,7 +2682,7 @@ REGISTER_OP("QuantizeAndDequantizeV3")
     .Attr("signed_input: bool = true")
     .Attr("range_given: bool = true")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 297e94655fe3c62893291de2a256b177222cd7a2..8af490341899bcc53a98bbbcba1b9cf2b927cb8a 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -128,6 +128,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates")
     .Output("num_trees: int32")
     .Output("num_finalized_trees: int32")
     .Output("num_attempted_layers: int32")
+    .Output("last_layer_nodes_range: int32")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused_input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
@@ -135,6 +136,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       c->set_output(3, c->Scalar());
+      c->set_output(4, c->Vector(2));
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6157a69df5cf535a0957df8b7ed6d4f597acd1d
--- /dev/null
+++ b/tensorflow/core/ops/collective_ops.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("CollectiveReduce")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("merge_op: {'Min', 'Max', 'Mul', 'Add'}")
+    .Attr("final_op: {'Id', 'Div'}")
+    .Attr("subdiv_offsets: list(int)")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("CollectiveBcastSend")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
+REGISTER_OP("CollectiveBcastRecv")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 10b24c2d3426ddb37d521fbbf4197318b9789d08..a45a95ae096c4fdff0646a168eff715a23fd2a4f 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -64,6 +64,31 @@ op {
     }
   }
 }
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "AccumulateNV2"
   input_arg {
@@ -607,6 +632,33 @@ op {
     }
   }
 }
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Acosh"
   input_arg {
@@ -656,6 +708,31 @@ op {
     }
   }
 }
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Add"
   input_arg {
@@ -725,6 +802,41 @@ op {
     }
   }
 }
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "AddManySparseToTensorsMap"
   input_arg {
@@ -1094,6 +1206,42 @@ op {
   is_aggregate: true
   is_commutative: true
 }
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AdjustContrast"
   input_arg {
@@ -6166,6 +6314,33 @@ op {
     }
   }
 }
+op {
+  name: "Asin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Asinh"
   input_arg {
@@ -6215,6 +6390,31 @@ op {
     }
   }
 }
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Assert"
   input_arg {
@@ -6761,6 +6961,33 @@ op {
     }
   }
 }
+op {
+  name: "Atan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Atan2"
   input_arg {
@@ -6812,6 +7039,33 @@ op {
     }
   }
 }
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Atanh"
   input_arg {
@@ -6861,6 +7115,31 @@ op {
     }
   }
 }
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "AudioSpectrogram"
   input_arg {
@@ -7331,6 +7610,111 @@ op {
     }
   }
 }
+op {
+  name: "AvgPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "AvgPool3DGrad"
   input_arg {
@@ -7367,6 +7751,19 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -7432,6 +7829,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7492,6 +7890,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -8328,6 +8727,50 @@ op {
     }
   }
 }
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "BatchMatrixBandPart"
   input_arg {
@@ -10155,99 +10598,160 @@ op {
   }
 }
 op {
-  name: "BitwiseAnd"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "Bitcast"
   input_arg {
-    name: "y"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "output"
+    type_attr: "type"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-      }
-    }
-  }
-  is_commutative: true
-}
-op {
-  name: "BitwiseAnd"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
         type: DT_INT8
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-  is_commutative: true
-}
-op {
-  name: "BitwiseOr"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "type"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "BitwiseOr"
+  name: "BitwiseAnd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseAnd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseOr"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseOr"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -10477,6 +10981,10 @@ op {
     name: "num_attempted_layers"
     type: DT_INT32
   }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
+  }
   is_stateful: true
 }
 op {
@@ -11081,6 +11589,29 @@ op {
     }
   }
 }
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "CheckNumerics"
   input_arg {
@@ -11134,6 +11665,33 @@ op {
     type: "string"
   }
 }
+op {
+  name: "CheckNumerics"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "message"
+    type: "string"
+  }
+}
 op {
   name: "Cholesky"
   input_arg {
@@ -11211,6 +11769,147 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveBcastRecv"
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastSend"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -13318,6 +14017,31 @@ op {
     }
   }
 }
+op {
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Cosh"
   input_arg {
@@ -13367,6 +14091,31 @@ op {
     }
   }
 }
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "CountUpTo"
   input_arg {
@@ -15782,6 +16531,65 @@ op {
     }
   }
 }
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
 op {
   name: "DecodeCompressed"
   input_arg {
@@ -15906,6 +16714,55 @@ op {
     }
   }
 }
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "message_format"
+    type: "string"
+    default_value {
+      s: "binary"
+    }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "DecodeRaw"
   input_arg {
@@ -16004,6 +16861,22 @@ op {
     }
   }
 }
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "DeleteSessionTensor"
   input_arg {
@@ -16570,114 +17443,15 @@ op {
   }
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropInput"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-}
-op {
-  name: "DepthwiseConv2dNativeBackpropInput"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
+  name: "DepthwiseConv2dNativeBackpropFilter"
   input_arg {
-    name: "filter"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
-}
-op {
-  name: "DepthwiseConv2dNativeBackpropInput"
-  input_arg {
-    name: "input_sizes"
+    name: "filter_sizes"
     type: DT_INT32
   }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
   input_arg {
     name: "out_backprop"
     type_attr: "T"
@@ -16691,6 +17465,245 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -17054,6 +18067,58 @@ op {
     }
   }
 }
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DiagPart"
   input_arg {
@@ -17069,6 +18134,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -17095,6 +18161,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -17150,6 +18217,29 @@ op {
     }
   }
 }
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Dilation2D"
   input_arg {
@@ -17923,6 +19013,41 @@ op {
     }
   }
 }
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DrawBoundingBoxes"
   input_arg {
@@ -18218,6 +19343,29 @@ op {
     }
   }
 }
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "EmptyTensorList"
   input_arg {
@@ -18379,6 +19527,42 @@ op {
     }
   }
 }
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "EncodeWav"
   input_arg {
@@ -18524,6 +19708,46 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Erf"
   input_arg {
@@ -18569,6 +19793,29 @@ op {
     }
   }
 }
+op {
+  name: "Erf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Erfc"
   input_arg {
@@ -18614,6 +19861,29 @@ op {
     }
   }
 }
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -18678,6 +19948,31 @@ op {
     }
   }
 }
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ExpandDims"
   input_arg {
@@ -18759,6 +20054,31 @@ op {
     }
   }
 }
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ExtractGlimpse"
   input_arg {
@@ -20539,6 +21859,29 @@ op {
     }
   }
 }
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "FloorDiv"
   input_arg {
@@ -20608,6 +21951,68 @@ op {
     }
   }
 }
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "FloorMod"
   input_arg {
@@ -20629,6 +22034,7 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -20657,6 +22063,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -23248,6 +24655,75 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceSub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "InterleaveDataset"
   input_arg {
@@ -23480,6 +24956,33 @@ op {
     }
   }
 }
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "InvGrad"
   input_arg {
@@ -23664,6 +25167,35 @@ op {
     }
   }
 }
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Invert"
   input_arg {
@@ -23797,6 +25329,29 @@ op {
     }
   }
 }
+op {
+  name: "IsFinite"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsInf"
   input_arg {
@@ -23842,6 +25397,29 @@ op {
     }
   }
 }
+op {
+  name: "IsInf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsNan"
   input_arg {
@@ -23887,6 +25465,29 @@ op {
     }
   }
 }
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsVariableInitialized"
   input_arg {
@@ -24722,84 +26323,106 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-}
-op {
-  name: "LessEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "LessEqual"
+  name: "Lgamma"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
+  output_arg {
     name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -24820,6 +26443,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -24841,8 +26465,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -25065,6 +26689,31 @@ op {
     }
   }
 }
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Log1p"
   input_arg {
@@ -25114,6 +26763,31 @@ op {
     }
   }
 }
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "LogMatrixDeterminant"
   input_arg {
@@ -26129,6 +27803,50 @@ op {
     }
   }
 }
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "MatchingFiles"
   input_arg {
@@ -27178,9 +28896,130 @@ op {
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "orig_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -27209,25 +29048,11 @@ op {
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -27273,6 +29098,19 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -27287,11 +29125,11 @@ op {
   name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type: DT_FLOAT
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    type: DT_FLOAT
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
@@ -27339,6 +29177,21 @@ op {
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -27407,6 +29260,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -27419,6 +29273,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -27485,6 +29340,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -27498,6 +29354,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -30007,6 +31864,36 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Mean"
   input_arg {
@@ -30395,137 +32282,197 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
         type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "Min"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "Min"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
+        type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -30544,64 +32491,33 @@ op {
   }
 }
 op {
-  name: "Min"
+  name: "Minimum"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  is_commutative: true
 }
 op {
   name: "Minimum"
@@ -30623,6 +32539,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30651,8 +32568,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30801,6 +32718,36 @@ op {
     }
   }
 }
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Mul"
   input_arg {
@@ -30872,6 +32819,42 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Multinomial"
   input_arg {
@@ -31474,6 +33457,33 @@ op {
     }
   }
 }
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "NegTrain"
   input_arg {
@@ -31657,6 +33667,46 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "NthElement"
   input_arg {
@@ -31924,6 +33974,38 @@ op {
     }
   }
 }
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
 op {
   name: "OrderedMapClear"
   attr {
@@ -33434,6 +35516,37 @@ op {
     }
   }
 }
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "PrefetchDataset"
   input_arg {
@@ -34288,6 +36401,117 @@ op {
     version: 22
   }
 }
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "QuantizeAndDequantizeV2"
   input_arg {
@@ -34332,6 +36556,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -34383,6 +36608,54 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -34430,6 +36703,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -34478,6 +36752,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -38530,6 +40805,41 @@ op {
     }
   }
 }
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Reciprocal"
   input_arg {
@@ -38583,6 +40893,33 @@ op {
     }
   }
 }
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ReciprocalGrad"
   input_arg {
@@ -38668,6 +41005,35 @@ op {
     }
   }
 }
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "RecordInput"
   output_arg {
@@ -48266,6 +50632,56 @@ op {
     }
   }
 }
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "RightShift"
   input_arg {
@@ -48341,6 +50757,29 @@ op {
     }
   }
 }
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Roll"
   input_arg {
@@ -48437,6 +50876,123 @@ op {
     }
   }
 }
+op {
+  name: "Round"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Rpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Rsqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Rsqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Rsqrt"
   input_arg {
@@ -48452,6 +51008,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -48462,22 +51019,25 @@ op {
   }
 }
 op {
-  name: "Rsqrt"
+  name: "RsqrtGrad"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -48489,11 +51049,11 @@ op {
 op {
   name: "RsqrtGrad"
   input_arg {
-    name: "x"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "dy"
     type_attr: "T"
   }
   output_arg {
@@ -48534,6 +51094,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -48561,8 +51122,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -52767,6 +55328,31 @@ op {
     }
   }
 }
+op {
+  name: "Sigmoid"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SigmoidGrad"
   input_arg {
@@ -52852,6 +55438,35 @@ op {
     }
   }
 }
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sign"
   input_arg {
@@ -52905,6 +55520,33 @@ op {
     }
   }
 }
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sin"
   input_arg {
@@ -52954,6 +55596,31 @@ op {
     }
   }
 }
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sinh"
   input_arg {
@@ -53003,6 +55670,31 @@ op {
     }
   }
 }
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Size"
   input_arg {
@@ -62161,6 +64853,31 @@ op {
     }
   }
 }
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SqrtGrad"
   input_arg {
@@ -62246,6 +64963,35 @@ op {
     }
   }
 }
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Square"
   input_arg {
@@ -62299,6 +65045,33 @@ op {
     }
   }
 }
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SquaredDifference"
   input_arg {
@@ -62362,6 +65135,38 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Squeeze"
   input_arg {
@@ -62731,6 +65536,71 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatelessMultinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessRandomNormal"
   input_arg {
@@ -63579,6 +66449,41 @@ op {
     }
   }
 }
+op {
+  name: "Sub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Substr"
   input_arg {
@@ -64219,6 +67124,33 @@ op {
     }
   }
 }
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Tanh"
   input_arg {
@@ -64268,6 +67200,31 @@ op {
     }
   }
 }
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TanhGrad"
   input_arg {
@@ -64353,6 +67310,35 @@ op {
     }
   }
 }
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TemporaryVariable"
   output_arg {
@@ -65710,6 +68696,25 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListPushBackBatch"
+  input_arg {
+    name: "input_handles"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handles"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListReserve"
   input_arg {
@@ -66630,6 +69635,68 @@ op {
     }
   }
 }
+op {
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TruncateMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "TruncateMod"
   input_arg {
@@ -66651,6 +69718,7 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -66679,6 +69747,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -66780,6 +69849,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unbatch"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index b25abbcc6780364c27c1a078b0a8980014c83a43..8a7185e00508f9865acfadadf9a5c3f43256d27b 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -141,7 +141,11 @@ REGISTER_OP("BytesProducedStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("LatencyStatsDataset")
     .Input("input_dataset: variant")
@@ -149,7 +153,11 @@ REGISTER_OP("LatencyStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
diff --git a/tensorflow/core/ops/decode_proto_ops.cc b/tensorflow/core/ops/decode_proto_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f6fb2f58234c564960ad378867a6af27d1b5d2e
--- /dev/null
+++ b/tensorflow/core/ops/decode_proto_ops.cc
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("DecodeProtoV2")
+    .Input("bytes: string")
+    .Attr("message_type: string")
+    .Attr("field_names: list(string)")
+    .Attr("output_types: list(type) >= 0")
+    .Attr("descriptor_source: string = 'local://'")
+    .Attr("message_format: string = 'binary'")
+    .Attr("sanitize: bool = false")
+    .Output("sizes: int32")
+    .Output("values: output_types")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input = c->input(0);
+
+      std::vector<tensorflow::DataType> output_types;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_types", &output_types));
+
+      ShapeHandle sizes;
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(input, c->Vector(output_types.size()), &sizes));
+      c->set_output(0, sizes);
+
+      // TODO(nix): to do the best possible job of shape inference, we
+      // should examine the proto descriptors here in order to set shape
+      // indices to 1 instead of unknown for optional or required fields.
+      // Any general-purpose code will have to handle the unknown case,
+      // but there might be XLA code that could be sped up with the additional
+      // knowledge.
+      for (int i = 0; i < output_types.size(); ++i) {
+        ShapeHandle values;
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(input, c->Vector(c->UnknownDim()), &values));
+        c->set_output(i + 1, values);
+      }
+
+      return Status::OK();
+    });
+
+// TODO(nix): Consider adding an additional input argument that truncates
+// repeated fields to a maximum count. For now this could be done by passing
+// the output through tf.slice.
+
+// TODO(nix): define missing value behavior.
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/encode_proto_ops.cc b/tensorflow/core/ops/encode_proto_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5ec3056e35c8be82302f6cb32174661c9979225
--- /dev/null
+++ b/tensorflow/core/ops/encode_proto_ops.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("EncodeProto")
+    .Input("sizes: int32")
+    .Input("values: Tinput_types")
+    .Attr("field_names: list(string)")
+    .Attr("message_type: string")
+    .Attr("descriptor_source: string = 'local://'")
+    .Attr("Tinput_types: list(type)")
+    .Output("bytes: string")
+    .SetShapeFn([](InferenceContext* c) {
+      int first_field_index = 1;
+      int num_fields = c->num_inputs() - 1;
+
+      ShapeHandle output;
+      for (int i = num_fields - 1; i >= 0; --i) {
+        ShapeHandle input = c->input(first_field_index + i);
+        TF_RETURN_IF_ERROR(c->WithRankAtLeast(input, 2, &input));
+        ShapeHandle inner;
+        TF_RETURN_IF_ERROR(c->Subshape(input, 0, -1, &inner));
+        TF_RETURN_IF_ERROR(c->Merge(inner, output, &output));
+      }
+
+      c->set_output(0, output);
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index cad617638ff12cd1020276341fbe9f9b7aac97bc..7af70110b7e8daa42be7be4ca09071ce072089ac 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -30,7 +30,8 @@ REGISTER_OP("EmptyTensorList")
       DataType t;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
       shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(
+          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
       c->set_output_handle_shapes_and_types(
           0, std::vector<shape_inference::ShapeAndType>{{s, t}});
       return Status::OK();
@@ -70,6 +71,50 @@ REGISTER_OP("TensorListPushBack")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListPushBackBatch")
+    .Input("input_handles: variant")
+    .Input("tensor: element_dtype")
+    .Output("output_handles: variant")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle input_handles;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input_handles));
+
+      shape_inference::ShapeHandle tensor;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &tensor));
+
+      TF_RETURN_IF_ERROR(
+          c->MergePrefix(tensor, input_handles, &tensor, &input_handles));
+
+      c->set_output(0, input_handles);
+
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle s = c->UnknownShape();
+
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to push to list with wrong variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != t) {
+          return errors::InvalidArgument(
+              "Trying to push to list with wrong element dtype. List has type ",
+              DataTypeString(list_shape_type.dtype),
+              " but trying to push element with type ", DataTypeString(t));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        s = list_shape_type.shape;
+      }
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListLength")
     .Input("input_handle: variant")
     .Output("length: int32")
@@ -193,6 +238,7 @@ REGISTER_OP("TensorListReserve")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
       shape_inference::ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       DataType t;
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8f33d51d5a20fc207102e4bf79e7605d9817eb9f..8f8443a46cfa68e9879825d36b305b4f7774bd66 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -65,7 +65,7 @@ REGISTER_OP("BatchMatMul")
     .Input("x: T")
     .Input("y: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
     .Attr("adj_x: bool = false")
     .Attr("adj_y: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -133,7 +133,7 @@ _HostCast requires its input and produces its output in host memory.
 REGISTER_OP("Abs")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("ComplexAbs")
@@ -148,27 +148,27 @@ REGISTER_OP("ComplexAbs")
   Input("x: T")                                                          \
       .Output("y: T")                                                    \
       .Attr(                                                             \
-          "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+          "T: {bfloat16, half, float, double, int32, int64, complex64, " \
           "complex128}")                                                 \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_REAL()                              \
   Input("x: T")                                   \
       .Output("y: T")                             \
-      .Attr("T: {half, bfloat16, float, double}") \
+      .Attr("T: {bfloat16, half, float, double}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_COMPLEX()                                                  \
   Input("x: T")                                                          \
       .Output("y: T")                                                    \
-      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
+      .Attr("T: {bfloat16, half, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_GRADIENT_COMPLEX()                                         \
   Input("y: T")                                                          \
       .Input("dy: T")                                                    \
       .Output("z: T")                                                    \
-      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
+      .Attr("T: {bfloat16, half, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 REGISTER_OP("Neg").UNARY();
@@ -246,57 +246,57 @@ REGISTER_OP("Atan").UNARY();
 REGISTER_OP("IsNan")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsInf")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsFinite")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Sign")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
         "complex128}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Floor")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Ceil")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Rint")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 // Declares cwise binary operations signature: 't, 't -> 't.
 
 #define BINARY_MORE()                                                          \
   Input("x: T").Input("y: T").Output("z: T").Attr(                             \
-      "T: {half, bfloat16, float, double, uint8, int8, uint16, int16, int32, " \
+      "T: {bfloat16, half, float, double, uint8, int8, uint16, int16, int32, " \
       "int64, complex64, complex128}")
 
 #define BINARY_FEWER()                                               \
   Input("x: T").Input("y: T").Output("z: T").Attr(                   \
-      "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+      "T: {bfloat16, half, float, double, int32, int64, complex64, " \
       "complex128}")
 
 REGISTER_OP("Add")
@@ -304,7 +304,7 @@ REGISTER_OP("Add")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
         "complex64, complex128, string}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -315,7 +315,7 @@ REGISTER_OP("AddV2")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
         "complex64, complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .SetIsAggregate()
@@ -412,7 +412,7 @@ REGISTER_OP("Maximum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -437,7 +437,7 @@ REGISTER_OP("Minimum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -445,21 +445,21 @@ REGISTER_OP("Mod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, float16, half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("FloorMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("TruncateMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Pow")
@@ -467,7 +467,7 @@ REGISTER_OP("Pow")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "T: {bfloat16, float, half, double, int32, int64, complex64, "
         "complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -503,7 +503,7 @@ REGISTER_OP("Atan2")
     .Input("y: T")
     .Input("x: T")
     .Output("z: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Betainc")
@@ -574,7 +574,7 @@ REGISTER_OP("GreaterEqual").COMPARISON();
       .Output("z: bool")                                                   \
       .SetIsCommutative()                                                  \
       .Attr(                                                               \
-          "T: {half, bfloat16, float, double, uint8, int8, int16, int32, " \
+          "T: {bfloat16, half, float, double, uint8, int8, int16, int32, " \
           "int64, complex64, quint8, qint8, qint32, string, bool, "        \
           "complex128}")                                                   \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
@@ -713,7 +713,7 @@ REGISTER_OP("MatMul")
     .Output("product: T")
     .Attr("transpose_a: bool = false")
     .Attr("transpose_b: bool = false")
-    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
     .SetShapeFn(shape_inference::MatMulShape);
 
 REGISTER_OP("SparseMatMul")
@@ -1558,6 +1558,14 @@ REGISTER_OP("Bucketize")
     .Attr("boundaries: list(float)")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("ClipByValue")
+    .Input("t: T")
+    .Input("clip_value_min: T")
+    .Input("clip_value_max: T")
+    .Output("output: T")
+    .Attr("T: numbertype")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 #ifdef INTEL_MKL
 REGISTER_OP("_MklAddN")
     .Input("inputs: N * T")
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 12d6dc5eaf29569d7a0f865afb4fb26b440be60b..18165fb6edb023087e283c695d25fa2475e78a47 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1533,6 +1533,7 @@ REGISTER_OP("__MklDummyConv2DWithBias")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 Dummy node that enables fusing Conv2D and BiasAdd operator for MKL. This node
 does not perform anything. It is just created as an intermediate output of
@@ -1559,6 +1560,7 @@ REGISTER_OP("_MklConv2DWithBias")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 MKL version of Conv2D and BiasAdd operator. Uses MKL DNN APIs to perform
 2D convolution and add Bias to the output of convolution.
@@ -1681,6 +1683,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+#ifdef INTEL_MKL_ML
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -1697,6 +1700,7 @@ gradients of convolution with respect to the bias.
 NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
+#endif
 
 REGISTER_OP("_MklConv2DBackpropInput")
     .Input("input_sizes: int32")
@@ -2154,6 +2158,7 @@ REGISTER_OP("_MklToTf")
     .Output("output: T")
     .Attr("T: {half, float, double}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to convert a tensor from MKL layout to TensorFlow layout.
 
@@ -2175,6 +2180,7 @@ REGISTER_OP("_MklInputConversion")
         "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, "
         "complex64, complex128}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to process the inputs to an elementwise MKL op. Both inputs
 need to be either in TF or in MKL format. This op is added before every
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 5764976aee1236c7a1b36d8a12e335f4aff7bc13..afb3dab3fe356e234243ef53921172ae4bba4e72 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -30,8 +30,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -210,8 +210,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -237,8 +237,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -266,8 +266,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -423,8 +423,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -1932,8 +1932,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -1959,8 +1959,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2191,8 +2191,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -2223,6 +2223,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -2244,8 +2245,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2448,6 +2449,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -2509,6 +2511,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -3004,8 +3007,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -3854,6 +3857,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -3869,7 +3873,6 @@ op {
         type: DT_QINT16
         type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -3879,6 +3882,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -3894,7 +3898,6 @@ op {
         type: DT_QINT16
         type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4132,6 +4135,10 @@ op {
     name: "num_attempted_layers"
     type: DT_INT32
   }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
+  }
   is_stateful: true
 }
 op {
@@ -4637,8 +4644,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -4660,8 +4667,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -4728,6 +4735,147 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveBcastRecv"
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastSend"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -5759,8 +5907,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -5784,8 +5932,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -7219,6 +7367,14 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
   name: "DecodeCompressed"
@@ -7344,6 +7500,55 @@ op {
     }
   }
 }
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "message_format"
+    type: "string"
+    default_value {
+      s: "binary"
+    }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "DecodeRaw"
   input_arg {
@@ -7408,6 +7613,22 @@ op {
     }
   }
 }
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "DeleteSessionTensor"
   input_arg {
@@ -7677,6 +7898,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -7746,6 +7968,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -7960,6 +8183,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7986,6 +8210,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -8011,8 +8236,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8217,8 +8442,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -8432,6 +8657,29 @@ op {
     }
   }
 }
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "EmptyTensorList"
   input_arg {
@@ -8593,6 +8841,42 @@ op {
     }
   }
 }
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "EncodeWav"
   input_arg {
@@ -8678,8 +8962,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -8714,8 +8998,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8737,8 +9021,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8775,8 +9059,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -8832,8 +9116,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -9700,8 +9984,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -9727,8 +10011,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -9765,6 +10049,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11604,22 +11889,91 @@ op {
     type: DT_RESOURCE
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tkey"
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceSub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
   }
   input_arg {
-    name: "values"
-    type_attr: "Tval"
+    name: "v"
+    type_attr: "T"
   }
-  attr {
-    name: "Tkey"
-    type: "type"
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "Tval"
+    name: "T"
     type: "type"
   }
-  is_stateful: true
 }
 op {
   name: "InterleaveDataset"
@@ -11680,8 +12034,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -11711,8 +12065,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -11799,8 +12153,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11822,8 +12176,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11845,8 +12199,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12360,8 +12714,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12508,8 +12862,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -12533,8 +12887,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -13390,8 +13744,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -13886,6 +14240,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -13953,6 +14308,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -13966,6 +14322,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -14625,8 +14982,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14881,8 +15238,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14997,6 +15354,8 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -15023,8 +15382,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15445,8 +15804,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -15581,8 +15940,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15746,6 +16105,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT8
@@ -17024,9 +17384,9 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
@@ -17456,6 +17816,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17511,6 +17872,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17559,6 +17921,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -20106,8 +20469,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -20137,8 +20500,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -20168,8 +20531,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23457,8 +23820,8 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BOOL
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23516,6 +23879,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -23580,8 +23944,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -23592,6 +23956,47 @@ op {
     }
   }
 }
+op {
+  name: "Rpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Rsqrt"
   input_arg {
@@ -23607,8 +24012,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23636,8 +24041,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25487,8 +25892,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25516,8 +25921,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25541,8 +25946,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -25568,8 +25973,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25593,8 +25998,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -28988,8 +29393,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29017,8 +29422,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29042,8 +29447,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -29073,8 +29478,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -29427,6 +29832,71 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatelessMultinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessRandomNormal"
   input_arg {
@@ -30022,8 +30492,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -30407,8 +30877,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30434,8 +30904,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -30463,8 +30933,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -31577,6 +32047,25 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListPushBackBatch"
+  input_arg {
+    name: "input_handles"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handles"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListReserve"
   input_arg {
@@ -32085,8 +32574,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -32123,6 +32612,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -32177,6 +32667,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unbatch"
   input_arg {
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index ddd2aa92748f244c2d132f00780a0d6424f1e595..ddb714b4e951aa485d087daa31368bad9f1261e4 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -245,6 +245,7 @@ REGISTER_OP("DecodeCSV")
     .Attr("field_delim: string = ','")
     .Attr("use_quote_delim: bool = true")
     .Attr("na_value: string = ''")
+    .Attr("select_cols: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       // Validate the record_defaults inputs.
       for (int i = 1; i < c->num_inputs(); ++i) {
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index f6c668f5c98efff07a49be15b1187f1858800110..416ce9c0d82ca0bfba730d3d7f4513260876e9ad 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -43,7 +43,12 @@ REGISTER_OP("RandomUniformInt")
     .Attr("seed2: int = 0")
     .Attr("Tout: {int32, int64}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape);
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::RandomShape(c);
+    });
 
 REGISTER_OP("RandomStandardNormal")
     .Input("shape: T")
diff --git a/tensorflow/core/ops/rpc_ops.cc b/tensorflow/core/ops/rpc_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72fda5e6eba3fd4cf3e53a26e0b4d9f5d6b19100
--- /dev/null
+++ b/tensorflow/core/ops/rpc_ops.cc
@@ -0,0 +1,81 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::DimensionHandle;
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+Status RpcShapeOp(InferenceContext* c, bool try_rpc) {
+  ShapeHandle address;
+  ShapeHandle method;
+  ShapeHandle request;
+  ShapeHandle output;
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &address));
+  if (c->Rank(address) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, address, &output));
+  }
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &method));
+  if (c->Rank(method) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, method, &output));
+  }
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &request));
+  if (c->Rank(request) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, request, &output));
+  }
+  if (!c->RankKnown(output)) {
+    output = request;
+  }
+  c->set_output(0, output);  // response
+  if (try_rpc) {
+    c->set_output(1, output);  // status_code
+    c->set_output(2, output);  // status_message
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("Rpc")
+    .Input("address: string")
+    .Input("method: string")
+    .Input("request: string")
+    .Attr("protocol: string = ''")
+    .Attr("fail_fast: bool = true")
+    .Attr("timeout_in_ms: int = 0")
+    .Output("response: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      return RpcShapeOp(c, /*try_rpc=*/false);
+    });
+
+REGISTER_OP("TryRpc")
+    .Input("address: string")
+    .Input("method: string")
+    .Input("request: string")
+    .Attr("protocol: string = ''")
+    .Attr("fail_fast: bool = true")
+    .Attr("timeout_in_ms: int = 0")
+    .Output("response: string")
+    .Output("status_code: int32")
+    .Output("status_message: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      return RpcShapeOp(c, /*try_rpc=*/true);
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
index 553850610a3c51986664fee52e04809626de22c1..742709fb1836a0f7e3f0bd94f3dbc3e15423a271 100644
--- a/tensorflow/core/ops/stateless_random_ops.cc
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -29,7 +29,7 @@ static Status StatelessShape(shape_inference::InferenceContext* context) {
   TF_RETURN_IF_ERROR(context->WithValue(context->Dim(seed, 0), 2, &unused));
 
   // Set output shape
-  shape_inference::ShapeHandle out;
+  ShapeHandle out;
   TF_RETURN_IF_ERROR(context->MakeShapeFromShapeTensor(0, &out));
   context->set_output(0, out);
   return Status::OK();
@@ -54,6 +54,32 @@ REGISTER_STATELESS_OP("StatelessRandomNormal");
 // This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessTruncatedNormal");
 
+// This op is exposed through contrib/stateless only.  The interface may change.
+REGISTER_OP("StatelessMultinomial")
+    .Input("logits: T")
+    .Input("num_samples: int32")
+    .Input("seed: Tseed")
+    .Output("output: output_dtype")
+    .Attr("T: realnumbertype")
+    .Attr("Tseed: {int32, int64} = DT_INT64")
+    .Attr("output_dtype: {int32, int64} = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Check seed shape
+      ShapeHandle seed;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &seed));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(seed, 0), 2, &unused_dim));
+
+      ShapeHandle logits_shape;
+      ShapeHandle unused;
+      DimensionHandle num_samples;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &logits_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(1, &num_samples));
+      c->set_output(0, c->Matrix(c->Dim(logits_shape, 0), num_samples));
+      return Status::OK();
+    });
+
 #undef REGISTER_STATELESS_OP
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 3c0dc13d75fb113ece6960e6cf1e39e9c3f9adf1..6ed1d5dad2aa5a3e406c167605497986c58134a8 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -301,6 +301,14 @@ class GcsRandomAccessFile : public RandomAccessFile {
     TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, scratch,
                                                &bytes_transferred));
     *result = StringPiece(scratch, bytes_transferred);
+    string checkpoint_ending = "/checkpoint";
+    // Check if the file is the checkpoint file as we should not be caching
+    // that. As it's contents are updated and used for iterating checkpoints.
+    if (std::equal(checkpoint_ending.rbegin(), checkpoint_ending.rend(),
+                   filename_.rbegin())) {
+      // Remove the checkpoint file from the cache
+      file_block_cache_->RemoveFile(filename_);
+    }
     if (bytes_transferred < n) {
       // This is not an error per se. The RandomAccessFile interface expects
       // that Read returns OutOfRange if fewer bytes were read than requested.
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 2fbde9b6a79883b674df15fb4e69b5cf3cc643a2..e9eca04fef98346627eb0ce68080a7bc4f65ec09 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -198,6 +198,54 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
   EXPECT_EQ("0123", result);
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_CheckpointFile_WithBlockCache) {
+  // Our underlying file in this test changes as new data comes in
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "012345678"),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/checkpoint\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "abcdefghi")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  char scratch[100];
+  StringPiece result;
+  {
+    // We are instantiating this in an enclosed scope to make sure after the
+    // unique ptr goes out of scope, we can still access result.
+    std::unique_ptr<RandomAccessFile> file;
+    TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/checkpoint", &file));
+
+    // Read the first chunk. The cache will be populated with the first block of
+    // 9 bytes.
+    scratch[5] = 'x';
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("0123", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+
+    // The second chunk should not be in cache so we make a new request
+    // As the checkpoint file should not be cached
+    TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+    EXPECT_EQ("abcd", result);
+    EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+  }
+}
+
 TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
   // Our underlying file in this test is a 15 byte file with contents
   // "0123456789abcde".
diff --git a/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc b/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
index d555b682a624309172588c9279d650d436f5d5cd..10203783fcbe06fefeb1c7451ffbb20045cc421a 100644
--- a/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache_test.cc
@@ -487,8 +487,7 @@ TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) {
         TF_EXPECT_OK(ReadCache(&cache, "", 0, block_size / 2, &out));
         EXPECT_EQ(out.size(), block_size / 2);
       }));
-  EXPECT_TRUE(WaitForNotificationWithTimeout(&notification, 10000))
-      << "Timeout waiting for concurrent thread to start.";
+  notification.WaitForNotification();
   std::vector<char> out;
   TF_EXPECT_OK(ReadCache(&cache, "", block_size / 2, block_size / 2, &out));
   EXPECT_EQ(out.size(), block_size / 2);
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e01e076bcf279206ea821d2777a3d44755668f02..4cfa25bf66eba51e4c8646f8596c2d7b9f9a4b53 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -122,6 +122,7 @@ def cc_proto_library(
     protoc="@protobuf_archive//:protoc",
     internal_bootstrap_hack=False,
     use_grpc_plugin=False,
+    use_grpc_namespace=False,
     default_header=False,
     **kargs):
   """Bazel rule to create a C++ protobuf library from proto source files.
@@ -169,8 +170,11 @@ def cc_proto_library(
     return
 
   grpc_cpp_plugin = None
+  plugin_options = []
   if use_grpc_plugin:
     grpc_cpp_plugin = "//external:grpc_cpp_plugin"
+    if use_grpc_namespace:
+      plugin_options = ["services_namespace=grpc"]
 
   gen_srcs = _proto_cc_srcs(srcs, use_grpc_plugin)
   gen_hdrs = _proto_cc_hdrs(srcs, use_grpc_plugin)
@@ -184,6 +188,7 @@ def cc_proto_library(
       protoc=protoc,
       plugin=grpc_cpp_plugin,
       plugin_language="grpc",
+      plugin_options=plugin_options,
       gen_cc=1,
       outs=outs,
       visibility=["//visibility:public"],
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 447056eb4b001b8e40eebc9c6e165023286d3c1f..44a89c3a96ad293be76d709ac21ac6bafe6afcbd 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -113,6 +113,12 @@ cc_library(
     copts = tf_copts(),
 )
 
+cc_library(
+    name = "base",
+    srcs = [],
+    copts = tf_copts(),
+)
+
 cc_library(
     name = "platformlib",
     copts = tf_copts(),
@@ -165,6 +171,13 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "test_lite_main",
+    testonly = 1,
+    linkstatic = 1,
+    deps = [],
+)
+
 cc_library(
     name = "test_main",
     testonly = 1,
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index 2b874da1981bed396330ca3c526d82779046bdf2..c6e5777c265137ca1b215e14a7be0c6422804b4b 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <android/log.h>
 #include <iostream>
 #include <sstream>
+#include <cstring>
 #endif
 
 #include <stdlib.h>
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 1b1faed7033ff52b3ffbaa17299c266ac902e470..37239681755c04152d3ae4a91ab7ec73a89f522b 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -31,13 +31,14 @@ limitations under the License.
   __attribute__((__format__(__printf__, string_index, first_to_check)))
 #define TF_SCANF_ATTRIBUTE(string_index, first_to_check) \
   __attribute__((__format__(__scanf__, string_index, first_to_check)))
-#elif defined(COMPILER_MSVC)
+#elif defined(_MSC_VER)
 // Non-GCC equivalents
 #define TF_ATTRIBUTE_NORETURN __declspec(noreturn)
-#define TF_ATTRIBUTE_ALWAYS_INLINE
+#define TF_ATTRIBUTE_ALWAYS_INLINE __forceinline
 #define TF_ATTRIBUTE_NOINLINE
 #define TF_ATTRIBUTE_UNUSED
 #define TF_ATTRIBUTE_COLD
+#define TF_ATTRIBUTE_WEAK
 #define TF_MUST_USE_RESULT
 #define TF_PACKED
 #define TF_PRINTF_ATTRIBUTE(string_index, first_to_check)
@@ -57,7 +58,7 @@ limitations under the License.
 #endif
 
 // Control visiblity outside .so
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_EXPORT __declspec(dllexport)
 #else
@@ -65,7 +66,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 
 #ifdef __has_builtin
 #define TF_HAS_BUILTIN(x) __has_builtin(x)
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index ee423699b2f15e973326358aa38776a71951edb3..6da679dc7523f52724cf992e7ba70351de3cf393 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -156,7 +156,7 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
     return errors::InvalidArgument("S3 path doesn't contain a bucket name: ",
                                    fname);
   }
-  objectp.Consume("/");
+  str_util::ConsumePrefix(&objectp, "/");
   *object = objectp.ToString();
   if (!empty_object_ok && object->empty()) {
     return errors::InvalidArgument("S3 path doesn't contain an object name: ",
diff --git a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto b/tensorflow/core/protobuf/checkpointable_object_graph.proto
similarity index 85%
rename from tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
rename to tensorflow/core/protobuf/checkpointable_object_graph.proto
index 024765acb28726fd102dfbf167f4e780072ce6e7..651f692f6d7b6d677b480a007f9ffe5c814beec3 100644
--- a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
+++ b/tensorflow/core/protobuf/checkpointable_object_graph.proto
@@ -2,14 +2,14 @@ syntax = "proto3";
 
 option cc_enable_arenas = true;
 
-package tensorflow.contrib.eager;
+package tensorflow;
 
-// Prototype format which saves extra information about the objects which own
-// variables, allowing for more robust checkpoint loading into modified
-// programs. Currently stored in its own entry in a TensorBundle.
+// A TensorBundle addition which saves extra information about the objects which
+// own variables, allowing for more robust checkpoint loading into modified
+// programs.
 
 message CheckpointableObjectGraph {
-  message Object {
+  message CheckpointableObject {
     message ObjectReference {
       // An index into `CheckpointableObjectGraph.nodes`, indicating the object
       // being referenced.
@@ -51,5 +51,5 @@ message CheckpointableObjectGraph {
     repeated SlotVariableReference slot_variables = 3;
   }
 
-  repeated Object nodes = 1;
+  repeated CheckpointableObject nodes = 1;
 }
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index a3557e4721644dd2577e7b56077a4e7ef8030463..c1a0075b6468cded7e5378ddcca1a7bcff914e98 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -409,6 +409,17 @@ message RunMetadata {
   repeated GraphDef partition_graphs = 3;
 }
 
+// Defines a connection between two tensors in a `GraphDef`.
+message TensorConnection {
+  // A tensor name. The value of this tensor will be substituted for
+  // the tensor named in `to_tensor`.
+  string from_tensor = 1;
+
+  // A tensor name. The value of this tensor will be bound to the
+  // value of the tensor named in `from_tensor`.
+  string to_tensor = 2;
+}
+
 // Defines a subgraph in another `GraphDef` as a set of feed points and nodes
 // to be fetched or executed.
 //
@@ -429,5 +440,10 @@ message CallableOptions {
   // Options that will be applied to each run.
   RunOptions run_options = 4;
 
-  // Next: 5
+  // Tensors to be connected in the callable. Each TensorConnection denotes
+  // a pair of tensors in the graph, between which an edge will be created
+  // in the callable.
+  repeated TensorConnection tensor_connection = 5;
+
+  // Next: 6
 }
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index 0437cb1b83e12d83bf3b8713e2940a6d45173fb5..96c91536f7386556c4c75ef463c4f781edd0aebb 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -23,6 +23,7 @@ option java_package = "org.tensorflow.distruntime";
 
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
+import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/lib/core/error_codes.proto";
 import "tensorflow/core/protobuf/config.proto";
 import "tensorflow/core/protobuf/named_tensor.proto";
@@ -264,3 +265,70 @@ message ListDevicesResponse {
   repeated DeviceAttributes local_device = 1;
   repeated DeviceAttributes remote_device = 2;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// MakeCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message MakeCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+
+  // Options that define the behavior of the created callable.
+  CallableOptions options = 2;
+}
+
+message MakeCallableResponse {
+  // A handle to the created callable.
+  int64 handle = 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// RunCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message RunCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+  // REQUIRED: handle must be returned by a MakeCallable call to the same
+  // master service.
+  int64 handle = 2;
+
+  // Values of the tensors passed as arguments to the callable, in the order
+  // defined in the CallableOptions.feed field passed to MakeCallable.
+  repeated TensorProto feed = 3;
+}
+
+message RunCallableResponse {
+  // Values of the tensors returned by the callable, in the order defined in the
+  // CallableOptions.fetch field passed to MakeCallable.
+  repeated TensorProto fetch = 1;
+
+  // Returned metadata if requested in the options.
+  RunMetadata metadata = 2;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ReleaseCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message ReleaseCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+
+  // REQUIRED: handle must be returned by a MakeCallable call to the same
+  // master service.
+  int64 handle = 2;
+}
+
+message ReleaseCallableResponse {
+}
diff --git a/tensorflow/core/protobuf/master_service.proto b/tensorflow/core/protobuf/master_service.proto
index 771c80562a7885e983e7e25f167e0ca56bba6cc8..1170611f37232704f7702185a3009bd1fa1e3f64 100644
--- a/tensorflow/core/protobuf/master_service.proto
+++ b/tensorflow/core/protobuf/master_service.proto
@@ -107,4 +107,13 @@ service MasterService {
   // will no longer affect fresh ones via the resources in containers listed in
   // the ResetRequest.  See ResetRequest for more details.
   rpc Reset(ResetRequest) returns (ResetResponse);
+
+  // Registers a callable for execution with RunCallable.
+  rpc MakeCallable(MakeCallableRequest) returns (MakeCallableResponse);
+
+  // Executes a callable registered with MakeCallable.
+  rpc RunCallable(RunCallableRequest) returns (RunCallableResponse);
+
+  // Frees resources associated with a callable registered with MakeCallable.
+  rpc ReleaseCallable(ReleaseCallableRequest) returns (ReleaseCallableResponse);
 }
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 706968d34745b8d21653bcee762f8a37555b93c1..0ca7d8475fc62974c7d5c5e2f171c25b14937001 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 7
+#define TF_MINOR_VERSION 8
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/guarded_philox_random.cc b/tensorflow/core/util/guarded_philox_random.cc
index 2d1e9a293e7b979076aa4d4f951b984491904c72..7c7ba4cef6bf1d20b1a9487bb9883c2a4e82bb9c 100644
--- a/tensorflow/core/util/guarded_philox_random.cc
+++ b/tensorflow/core/util/guarded_philox_random.cc
@@ -43,6 +43,14 @@ void GuardedPhiloxRandom::Init(int64 seed, int64 seed2) {
   initialized_ = true;
 }
 
+void GuardedPhiloxRandom::Init(random::PhiloxRandom::ResultType counter,
+                               random::PhiloxRandom::Key key) {
+  CHECK(!initialized_);
+  mutex_lock lock(mu_);
+  generator_ = random::PhiloxRandom(counter, key);
+  initialized_ = true;
+}
+
 random::PhiloxRandom GuardedPhiloxRandom::ReserveSamples128(int64 samples) {
   CHECK(initialized_);
   mutex_lock lock(mu_);
diff --git a/tensorflow/core/util/guarded_philox_random.h b/tensorflow/core/util/guarded_philox_random.h
index 5b94a76777b25e0aea29ae0898e1a5e8d7fab80e..44970eb9499be37a6bdf7ad61256c72aac3bccda 100644
--- a/tensorflow/core/util/guarded_philox_random.h
+++ b/tensorflow/core/util/guarded_philox_random.h
@@ -49,6 +49,8 @@ class GuardedPhiloxRandom {
 
   // Initialize with given seeds.
   void Init(int64 seed, int64 seed2);
+  void Init(random::PhiloxRandom::ResultType counter,
+            random::PhiloxRandom::Key key);
 
   // Reserve a certain number of 128-bit samples.
   // This function is thread safe.  The returned generator is valid for the
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index 1fa6b8bec037c3ee0d2b9b95f2ccce59813c98b9..d3439cbc9385184da830f70e53acb27eff570ba1 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -185,7 +185,7 @@ const void* MemmappedFileSystem::GetMemoryWithOffset(uint64 offset) const {
   return reinterpret_cast<const uint8*>(mapped_memory_->data()) + offset;
 }
 
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
 constexpr char* MemmappedFileSystem::kMemmappedPackagePrefix;
 constexpr char* MemmappedFileSystem::kMemmappedPackageDefaultGraphDef;
 #else
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index 76cc4911f5e067253455d4d4eb86189e7d6e9de8..958e23d28e5ee8cf0052f84064f73315bc1d117b 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -53,7 +53,7 @@ class MemmappedFileSystem : public FileSystem {
  public:
   // Memmapped regions use this prefix to distinguish from
   // the filesystem.
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
   static constexpr char* kMemmappedPackagePrefix =
 #else
   static constexpr char kMemmappedPackagePrefix[] =
@@ -61,7 +61,7 @@ class MemmappedFileSystem : public FileSystem {
       "memmapped_package://";
 
 // The default graphdef in the package.
-#if defined(COMPILER_MSVC)
+#if defined(_MSC_VER)
   static constexpr char* kMemmappedPackageDefaultGraphDef =
 #else
   static constexpr char kMemmappedPackageDefaultGraphDef[] =
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ade14ed1620e92a2246963eaa0b317275dd4ad3d
--- /dev/null
+++ b/tensorflow/core/util/proto/BUILD
@@ -0,0 +1,62 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "decode",
+    hdrs = ["decode.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "descriptors",
+    srcs = ["descriptors.cc"],
+    hdrs = ["descriptors.h"],
+    deps = [
+        ":descriptor_pool_registry",
+        ":local_descriptor_pool_registration",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "descriptor_pool_registry",
+    srcs = ["descriptor_pool_registry.cc"],
+    hdrs = ["descriptor_pool_registry.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "descriptor_pool_registry_test",
+    srcs = ["descriptor_pool_registry_test.cc"],
+    deps = [
+        ":descriptor_pool_registry",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+# Depending on this target adds support for using the special
+# value "local://" (or "") for descriptor source, in which case
+# descriptors linked into the code will be searched.
+cc_library(
+    name = "local_descriptor_pool_registration",
+    srcs = ["local_descriptor_pool_registration.cc"],
+    deps = [
+        ":descriptor_pool_registry",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
new file mode 100644
index 0000000000000000000000000000000000000000..74634a356a84db0fb72a15e223f373598c668eee
--- /dev/null
+++ b/tensorflow/core/util/proto/decode.h
@@ -0,0 +1,592 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Inline functions for parsing the protocol buffers wire format.
+//
+// These functions have been optimized at the expense of safety.
+// They are broken out into a separate file for readability but are
+// not intended for use by clients other than the decode_proto op.
+//
+// The calling code in the decode_proto op does some fairly
+// complicated things to ensure that this code is called
+// safely. Changes to this code should be thoroughly fuzz tested.
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace internal {
+
+using tensorflow::protobuf::internal::WireFormatLite;
+using tensorflow::protobuf::io::CodedInputStream;
+using tensorflow::protobuf::io::CodedOutputStream;
+using tensorflow::protobuf::io::StringOutputStream;
+
+// Converts an uint64 to an int64 without loss of information.
+// Unsigned values greater than INT64_MAX are represented as
+// negative numbers by wrapping (same as twos-complement bit equivalence).
+inline int64 WrapUnsignedAsSigned64(uint64 unsigned_value) {
+  // For a detailed explanation of why this works to wrap unsigned ints, see
+  // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
+  // Both if tests should be optimized out.
+  if (unsigned_value <= INT64_MAX) {
+    return static_cast<int64>(unsigned_value);
+  }
+  // The C++ spec allows an architecture where this test is required.
+  if (unsigned_value >= INT64_MIN) {
+    return static_cast<int64>(unsigned_value - INT64_MIN) + INT64_MIN;
+  }
+  return 0;  // This should never occur.
+}
+
+// Converts an uint32 to an int32 without loss of information.
+// Unsigned values greater than INT_MAX are represented as
+// negative numbers by wrapping (same as twos-complement bit equivalence).
+inline int32 WrapUnsignedAsSigned32(uint32 unsigned_value) {
+  // For a detailed explanation of why this works to wrap unsigned ints, see
+  // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
+  // Both if tests should be optimized out.
+  if (unsigned_value <= INT_MAX) {
+    return static_cast<int32>(unsigned_value);
+  }
+  // The C++ spec allows an architecture where this test is required.
+  if (unsigned_value >= INT_MIN) {
+    return static_cast<int32>(unsigned_value - INT_MIN) + INT_MIN;
+  }
+  return 0;  // This should never occur.
+}
+
+// Reads a single varint32 from a byte array.
+// It is the caller's responsibility to ensure that there is enough
+// space in the buffer.
+// The ok value will be set to false if the buffer does not contain
+// a valid varint.
+inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
+                                          uint64* value);
+
+// Reads a single varint32 from a byte array.
+// It is the caller's responsibility to ensure that there is enough
+// space in the buffer.
+// The ok value will be set to false if the buffer does not contain
+// a valid varint.
+// This is slightly less efficient than the private version in
+// coded_stream.cc but we duplicate less code by calling
+// the 64 bit version instead of copying the code.
+inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok,
+                                          uint32* value) {
+  uint64 tmp;
+  const uint8* buf = ReadVarint64FromArray(buffer, ok, &tmp);
+  *value = tmp & 0xffffffff;
+  return buf;
+}
+
+// Reads a single proto field value from a byte array into an array.
+// The array is part of a Tensor that was allocated by the caller
+// with type TensorType, while DeclaredType is the proto field type.
+template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
+const uint8* ReadFromArray(const uint8* buf, TensorType* value);
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int32>(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = WrapUnsignedAsSigned64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_UINT32>(
+    const uint8* buf, int64* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_UINT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = WrapUnsignedAsSigned32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_UINT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int64>(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = WireFormatLite::ZigZagDecode32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = WireFormatLite::ZigZagDecode64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_FIXED32>(
+    const uint8* buf, int64* value) {
+  uint32 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+                                               WireFormatLite::TYPE_FIXED32>(
+      buf, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_FIXED32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+                                               WireFormatLite::TYPE_FIXED32>(
+      buf, &temp);
+  *value = WrapUnsignedAsSigned32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_FIXED64>(
+    const uint8* buf, int64* value) {
+  protobuf_uint64 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_uint64,
+                                               WireFormatLite::TYPE_FIXED64>(
+      buf, &temp);
+  *value = WrapUnsignedAsSigned64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SFIXED32>(
+    const uint8* buf, int32* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<int32,
+                                                WireFormatLite::TYPE_SFIXED32>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SFIXED64>(
+    const uint8* buf, int64* value) {
+  protobuf_int64 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_int64,
+                                               WireFormatLite::TYPE_SFIXED64>(
+      buf, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<float, WireFormatLite::TYPE_FLOAT>(
+    const uint8* buf, float* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<float,
+                                                WireFormatLite::TYPE_FLOAT>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_DOUBLE>(
+    const uint8* buf, double* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<double,
+                                                WireFormatLite::TYPE_DOUBLE>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
+    const uint8* buf, bool* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = temp != 0;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int, WireFormatLite::TYPE_ENUM>(
+    const uint8* buf, int* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int>(temp);
+  return buf;
+}
+
+// Reads packed values from an array.
+// Stride is set to 1 for repeated fields, and 0 for non-repeated fields
+// (where any value overwrites previous values).
+template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
+inline int ReadPackedPrimitives(const void* bufp, const size_t len,
+                                const int index, const int stride,
+                                void* datap) {
+  const uint8* buf = reinterpret_cast<const uint8*>(bufp);
+  const uint8* bound = buf + len;
+  TensorType* data = reinterpret_cast<TensorType*>(datap) + index;
+  int count;
+
+  // This could overrun the bound by stride-1. This is defended
+  // against in the caller, where it ensures that the input buffer
+  // contains complete values.
+  for (count = 0; buf < bound; count += stride) {
+    buf = ReadFromArray<TensorType, DeclaredType>(buf, data + count);
+  }
+  return count;
+}
+
+// Reads a primitive value field from a serialized proto.
+// The value is parsed from the serialized format, then static_cast
+// to the desired type for TensorFlow and stored.
+template <class ValueType, class TensorType,
+          enum WireFormatLite::FieldType DeclaredType>
+inline Status ReadPrimitive(CodedInputStream* input, int index, void* data) {
+  ValueType v;
+  if (!WireFormatLite::ReadPrimitive<ValueType, DeclaredType>(input, &v)) {
+    return errors::DataLoss("Failed reading primitive");
+  }
+
+  reinterpret_cast<TensorType*>(data)[index] = v;
+  return Status::OK();
+}
+
+// Reads a string, submessage, or other variable-length field from a
+// serialized proto.
+// May read all or part of a repeated field.
+inline Status ReadBytes(CodedInputStream* input, int index, void* datap) {
+  string* data = reinterpret_cast<string*>(datap) + index;
+  if (!WireFormatLite::ReadBytes(input, data)) {
+    return errors::DataLoss("Failed reading bytes");
+  }
+  return Status::OK();
+}
+
+// Reads a tag-delimited field (TYPE_GROUP) from a serialized proto,
+// as a bytestring.
+inline Status ReadGroupBytes(CodedInputStream* input, int field_number,
+                             int index, void* datap) {
+  // WireFormatLite::SkipField has an option to emit the
+  // skipped bytes to an output stream. We could do better by implementing our
+  // own scanner but this is simpler for now.
+  // TODO(nix): there is a faster way to grab TYPE_GROUP bytes by relying
+  // on input->IsFlat() == true and using input->GetDirectBufferPointer()
+  // with input->CurrentPosition().
+  string* data = reinterpret_cast<string*>(datap) + index;
+  StringOutputStream string_stream(data);
+  CodedOutputStream out(&string_stream);
+  if (!WireFormatLite::SkipField(
+          input,
+          WireFormatLite::MakeTag(field_number,
+                                  WireFormatLite::WIRETYPE_START_GROUP),
+          &out)) {
+    return errors::DataLoss("Failed reading group");
+  }
+  return Status::OK();
+}
+
+// Reads a single field value from a CodedInputStream into a tensor.
+inline Status ReadValue(CodedInputStream* input,
+                        WireFormatLite::FieldType field_type, int field_number,
+                        DataType dtype, int index, void* datap) {
+  // Dispatch to the appropriately typed field reader based on the
+  // schema type.
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return ReadPrimitive<double, double, WireFormatLite::TYPE_DOUBLE>(
+          input, index, datap);
+    case WireFormatLite::TYPE_FLOAT:
+      if (dtype == DataType::DT_FLOAT) {
+        return ReadPrimitive<float, float, WireFormatLite::TYPE_FLOAT>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_DOUBLE) {
+        return ReadPrimitive<float, double, WireFormatLite::TYPE_FLOAT>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FLOAT");
+    case WireFormatLite::TYPE_INT64:
+      return ReadPrimitive<protobuf_int64, int64, WireFormatLite::TYPE_INT64>(
+          input, index, datap);
+    case WireFormatLite::TYPE_UINT64:
+      return ReadPrimitive<protobuf_uint64, int64, WireFormatLite::TYPE_UINT64>(
+          input, index, datap);
+    case WireFormatLite::TYPE_INT32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_INT32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_FIXED64:
+      return ReadPrimitive<protobuf_uint64, int64,
+                           WireFormatLite::TYPE_FIXED64>(input, index, datap);
+    case WireFormatLite::TYPE_FIXED32:
+      if (dtype == DataType::DT_INT64) {
+        return ReadPrimitive<uint32, int64, WireFormatLite::TYPE_FIXED32>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_INT32) {
+        return ReadPrimitive<uint32, int32, WireFormatLite::TYPE_FIXED32>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FIXED32");
+    case WireFormatLite::TYPE_BOOL:
+      return ReadPrimitive<bool, bool, WireFormatLite::TYPE_BOOL>(input, index,
+                                                                  datap);
+    case WireFormatLite::TYPE_STRING:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_GROUP:
+      return ReadGroupBytes(input, field_number, index, datap);
+    case WireFormatLite::TYPE_MESSAGE:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_BYTES:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_UINT32:
+      if (dtype == DataType::DT_INT64) {
+        return ReadPrimitive<uint32, int64, WireFormatLite::TYPE_UINT32>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_INT32) {
+        return ReadPrimitive<uint32, int32, WireFormatLite::TYPE_UINT32>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_UINT32");
+    case WireFormatLite::TYPE_ENUM:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_ENUM>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SFIXED32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SFIXED32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SFIXED64:
+      return ReadPrimitive<protobuf_int64, int64,
+                           WireFormatLite::TYPE_SFIXED64>(input, index, datap);
+    case WireFormatLite::TYPE_SINT32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SINT32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SINT64:
+      return ReadPrimitive<protobuf_int64, int64, WireFormatLite::TYPE_SINT64>(
+          input, index, datap);
+      // default: intentionally omitted in order to enable static checking.
+  }
+  // Unreachable.
+  return errors::DataLoss("Failed reading unknown wire type");
+}
+
+// Reads and stores a length-delimited list of values.
+inline Status ReadPackedFromArray(const void* buf, size_t buf_size,
+                                  const WireFormatLite::FieldType field_type,
+                                  const int field_number, const DataType dtype,
+                                  const int stride, int* index, void* data) {
+  // Dispatch to the appropriately typed field reader based on the
+  // schema type.
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      *index += ReadPackedPrimitives<double, WireFormatLite::TYPE_DOUBLE>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FLOAT:
+      *index += ReadPackedPrimitives<float, WireFormatLite::TYPE_FLOAT>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_INT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_INT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_UINT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_UINT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_INT32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_INT32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FIXED64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_FIXED64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FIXED32:
+      if (dtype == DataType::DT_INT64) {
+        *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_FIXED32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      if (dtype == DataType::DT_INT32) {
+        *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_FIXED32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FIXED32");
+    case WireFormatLite::TYPE_BOOL:
+      *index += ReadPackedPrimitives<bool, WireFormatLite::TYPE_BOOL>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_STRING:
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+    case WireFormatLite::TYPE_BYTES:
+      return errors::DataLoss("Non-primitive type encountered as packed");
+    case WireFormatLite::TYPE_UINT32:
+      if (dtype == DataType::DT_INT64) {
+        *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_UINT32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      if (dtype == DataType::DT_INT32) {
+        *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_UINT32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_UINT32");
+    case WireFormatLite::TYPE_ENUM:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_ENUM>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_SFIXED32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SFIXED32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SFIXED64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_SFIXED64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SINT32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SINT32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SINT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_SINT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+      // default: intentionally omitted in order to enable static checking.
+  }
+  // Unreachable.
+  return errors::DataLoss("Failed reading unknown wire type");
+}
+
+// Reads a varint from the given buffer, write it to *value, and return the
+// new buffer pointer.
+// This was copied from coded_stream.cc where it is private.
+// Important: This routine may read as much as kMaxVarintBytes from
+// the buffer. It is the caller's responsibility to make sure that there is
+// enough space in the buffer.
+inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
+                                          uint64* value) {
+  const uint8* ptr = buffer;
+  uint32 b;
+
+  // Splitting into 32-bit pieces gives better performance on 32-bit
+  // processors.
+  uint32 part0 = 0, part1 = 0, part2 = 0;
+
+  b = *(ptr++);
+  part0 = b;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80;
+  b = *(ptr++);
+  part0 += b << 7;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 7;
+  b = *(ptr++);
+  part0 += b << 14;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 14;
+  b = *(ptr++);
+  part0 += b << 21;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 21;
+  b = *(ptr++);
+  part1 = b;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80;
+  b = *(ptr++);
+  part1 += b << 7;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 7;
+  b = *(ptr++);
+  part1 += b << 14;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 14;
+  b = *(ptr++);
+  part1 += b << 21;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 21;
+  b = *(ptr++);
+  part2 = b;
+  if (!(b & 0x80)) goto done;
+  part2 -= 0x80;
+  b = *(ptr++);
+  part2 += b << 7;
+  if (!(b & 0x80)) goto done;
+  // "part2 -= 0x80 << 7" is irrelevant because (0x80 << 7) << 56 is 0.
+
+  // We have overrun the maximum size of a varint (10 bytes).  Assume
+  // the data is corrupt.
+  *ok = false;
+  return ptr;
+
+done:
+  *ok = true;
+  *value = (static_cast<uint64>(part0)) | (static_cast<uint64>(part1) << 28) |
+           (static_cast<uint64>(part2) << 56);
+  return ptr;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.cc b/tensorflow/core/util/proto/descriptor_pool_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f0423f76b74c2b24555e6908a2b61b3ba28598f
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+namespace tensorflow {
+
+DescriptorPoolRegistry* DescriptorPoolRegistry::Global() {
+  static DescriptorPoolRegistry* registry = new DescriptorPoolRegistry;
+  return registry;
+}
+
+DescriptorPoolRegistry::DescriptorPoolFn* DescriptorPoolRegistry::Get(
+    const string& source) {
+  auto found = fns_.find(source);
+  if (found == fns_.end()) return nullptr;
+  return &found->second;
+}
+
+void DescriptorPoolRegistry::Register(
+    const string& source,
+    const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
+  auto existing = Get(source);
+  CHECK_EQ(existing, nullptr)
+      << "descriptor pool for source: " << source << " already registered";
+  fns_.insert(std::pair<const string&, DescriptorPoolFn>(source, pool_fn));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.h b/tensorflow/core/util/proto/descriptor_pool_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..66c20e9e41337292bccf0c11c6c0b94a05e5df54
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.h
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+class DescriptorPoolRegistry {
+ public:
+  typedef std::function<Status(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool)>
+      DescriptorPoolFn;
+
+  // Returns a pointer to a global DescriptorPoolRegistry object.
+  static DescriptorPoolRegistry* Global();
+
+  // Returns a pointer to a descriptor pool function for the given source.
+  DescriptorPoolFn* Get(const string& source);
+
+  // Registers a descriptor pool factory.
+  void Register(const string& source, const DescriptorPoolFn& pool_fn);
+
+ private:
+  std::map<string, DescriptorPoolFn> fns_;
+};
+
+namespace descriptor_pool_registration {
+
+class DescriptorPoolRegistration {
+ public:
+  DescriptorPoolRegistration(
+      const string& source,
+      const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
+    DescriptorPoolRegistry::Global()->Register(source, pool_fn);
+  }
+};
+
+}  // namespace descriptor_pool_registration
+
+#define REGISTER_DESCRIPTOR_POOL(source, pool_fn) \
+  REGISTER_DESCRIPTOR_POOL_UNIQ_HELPER(__COUNTER__, source, pool_fn)
+
+#define REGISTER_DESCRIPTOR_POOL_UNIQ_HELPER(ctr, source, pool_fn) \
+  REGISTER_DESCRIPTOR_POOL_UNIQ(ctr, source, pool_fn)
+
+#define REGISTER_DESCRIPTOR_POOL_UNIQ(ctr, source, pool_fn)       \
+  static descriptor_pool_registration::DescriptorPoolRegistration \
+      descriptor_pool_registration_fn_##ctr(source, pool_fn)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry_test.cc b/tensorflow/core/util/proto/descriptor_pool_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6899998ab57a2c90458db31596dc6bf00e8adc0
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry_test.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Value {
+  static Status Function(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+    return Status::OK();
+  }
+};
+
+REGISTER_DESCRIPTOR_POOL("TEST POOL 1", Value::Function);
+REGISTER_DESCRIPTOR_POOL("TEST POOL 2", Value::Function);
+}  // namespace
+
+TEST(DescriptorPoolRegistryTest, TestBasic) {
+  EXPECT_EQ(DescriptorPoolRegistry::Global()->Get("NON-EXISTENT"), nullptr);
+  auto pool1 = DescriptorPoolRegistry::Global()->Get("TEST POOL 1");
+  EXPECT_NE(pool1, nullptr);
+  auto pool2 = DescriptorPoolRegistry::Global()->Get("TEST POOL 2");
+  EXPECT_NE(pool2, nullptr);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptors.cc b/tensorflow/core/util/proto/descriptors.cc
new file mode 100644
index 0000000000000000000000000000000000000000..271c85efd88de0f8acbedb3d2254af3397601c6b
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptors.cc
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+#include "tensorflow/core/util/proto/descriptors.h"
+
+namespace tensorflow {
+namespace {
+
+// Build a `DescriptorPool` from the named file or URI. The file or URI
+// must be available to the current TensorFlow environment.
+//
+// The file must contiain a serialized `FileDescriptorSet`. See
+// `GetDescriptorPool()` for more information.
+Status GetDescriptorPoolFromFile(
+    tensorflow::Env* env, const string& filename,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+  Status st = env->FileExists(filename);
+  if (!st.ok()) {
+    return st;
+  }
+
+  // Read and parse the FileDescriptorSet.
+  tensorflow::protobuf::FileDescriptorSet descs;
+  std::unique_ptr<tensorflow::ReadOnlyMemoryRegion> buf;
+  st = env->NewReadOnlyMemoryRegionFromFile(filename, &buf);
+  if (!st.ok()) {
+    return st;
+  }
+  if (!descs.ParseFromArray(buf->data(), buf->length())) {
+    return errors::InvalidArgument(
+        "descriptor_source contains invalid FileDescriptorSet: ", filename);
+  }
+
+  // Build a DescriptorPool from the FileDescriptorSet.
+  owned_desc_pool->reset(new tensorflow::protobuf::DescriptorPool());
+  for (const auto& filedesc : descs.file()) {
+    if ((*owned_desc_pool)->BuildFile(filedesc) == nullptr) {
+      return errors::InvalidArgument(
+          "Problem loading FileDescriptorProto (missing dependencies?): ",
+          filename);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status GetDescriptorPool(
+    tensorflow::Env* env, string const& descriptor_source,
+    tensorflow::protobuf::DescriptorPool const** desc_pool,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+  // Attempt to lookup the pool in the registry.
+  auto pool_fn = DescriptorPoolRegistry::Global()->Get(descriptor_source);
+  if (pool_fn != nullptr) {
+    return (*pool_fn)(desc_pool, owned_desc_pool);
+  }
+
+  // If there is no pool function registered for the given source, let the
+  // runtime find the file or URL.
+  Status status =
+      GetDescriptorPoolFromFile(env, descriptor_source, owned_desc_pool);
+  if (status.ok()) {
+    *desc_pool = owned_desc_pool->get();
+  }
+  *desc_pool = owned_desc_pool->get();
+  return status;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptors.h b/tensorflow/core/util/proto/descriptors.h
new file mode 100644
index 0000000000000000000000000000000000000000..92ee8997ab28f151a7b15b0d81628988e98159f4
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptors.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+class Env;
+class Status;
+
+// Get a `DescriptorPool` object from the named `descriptor_source`.
+// `descriptor_source` may be a path to a file accessible to TensorFlow, in
+// which case it is parsed as a `FileDescriptorSet` and used to build the
+// `DescriptorPool`.
+//
+// `owned_desc_pool` will be filled in with the same pointer as `desc_pool` if
+// the caller should take ownership.
+extern tensorflow::Status GetDescriptorPool(
+    tensorflow::Env* env, string const& descriptor_source,
+    tensorflow::protobuf::DescriptorPool const** desc_pool,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
diff --git a/tensorflow/core/util/proto/local_descriptor_pool_registration.cc b/tensorflow/core/util/proto/local_descriptor_pool_registration.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48fe0102d011724a91004ff3297e07259df87c27
--- /dev/null
+++ b/tensorflow/core/util/proto/local_descriptor_pool_registration.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+namespace tensorflow {
+namespace {
+
+struct LocalDescriptorPool {
+  static Status Function(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+    *desc_pool = ::tensorflow::protobuf::DescriptorPool::generated_pool();
+    if (*desc_pool == nullptr) {
+      return errors::InvalidArgument("Problem loading protobuf generated_pool");
+    }
+    return Status::OK();
+  }
+};
+
+REGISTER_DESCRIPTOR_POOL("", LocalDescriptorPool::Function);
+REGISTER_DESCRIPTOR_POOL("local://", LocalDescriptorPool::Function);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/reporter.cc b/tensorflow/core/util/reporter.cc
index ee38f81f3e1b370b3f10d3998dee5b3a6d916e7d..a595c9509e66c7bb3ac7fadbd0e87cfd81d1d611 100644
--- a/tensorflow/core/util/reporter.cc
+++ b/tensorflow/core/util/reporter.cc
@@ -47,6 +47,18 @@ Status TestReporter::Benchmark(int64 iters, double cpu_time, double wall_time,
   return Status::OK();
 }
 
+Status TestReporter::SetProperty(const string& name, const string& value) {
+  if (closed_) return Status::OK();
+  (*benchmark_entry_.mutable_extras())[name].set_string_value(value);
+  return Status::OK();
+}
+
+Status TestReporter::SetProperty(const string& name, double value) {
+  if (closed_) return Status::OK();
+  (*benchmark_entry_.mutable_extras())[name].set_double_value(value);
+  return Status::OK();
+}
+
 Status TestReporter::Initialize() {
   if (fname_.empty()) {
     return Status::OK();
diff --git a/tensorflow/core/util/reporter.h b/tensorflow/core/util/reporter.h
index bcae12204ec369af74d748e5e73d06616309f289..e551e2e4f57decff586fc0bd4a8514ca7af8e0ec 100644
--- a/tensorflow/core/util/reporter.h
+++ b/tensorflow/core/util/reporter.h
@@ -34,11 +34,13 @@ namespace tensorflow {
 //
 // If this environment variable is not defined, no logging is performed.
 //
-// The intended use is via the following 4 lines:
+// The intended use is via the following lines:
 //
 //  TestReporter reporter(test_name);
 //  TF_CHECK_OK(reporter.Initialize()));
 //  TF_CHECK_OK(reporter.Benchmark(iters, cpu_time, wall_time, throughput));
+//  TF_CHECK_OK(reporter.SetProperty("some_string_property", "some_value");
+//  TF_CHECK_OK(reporter.SetProperty("some_double_property", double_value);
 //  TF_CHECK_OK(reporter.Close());
 //
 // For example, if the environment variable
@@ -75,6 +77,12 @@ class TestReporter {
   Status Benchmark(int64 iters, double cpu_time, double wall_time,
                    double throughput);
 
+  // Set property on Benchmark to the given value.
+  Status SetProperty(const string& name, double value);
+
+  // Set property on Benchmark to the given value.
+  Status SetProperty(const string& name, const string& value);
+
   // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
   ~TestReporter() { Close().IgnoreError(); }  // Autoclose in destructor.
 
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 90ea09876e85468fbc05a1baa79b29a7a42ebace..0972b86ea5fefa4b490ee61eeb1937b136783801 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -115,5 +115,28 @@ TEST(TestReporter, Benchmark) {
   EXPECT_EQ(benchmark_entry.throughput(), 3.0);
 }
 
+TEST(TestReporter, SetProperties) {
+  string fname =
+      strings::StrCat(testing::TmpDir(), "/test_reporter_benchmarks_");
+  TestReporter test_reporter(fname, "b2/3/4");
+  TF_EXPECT_OK(test_reporter.Initialize());
+  TF_EXPECT_OK(test_reporter.SetProperty("string_prop", "abc"));
+  TF_EXPECT_OK(test_reporter.SetProperty("double_prop", 4.0));
+
+  TF_EXPECT_OK(test_reporter.Close());
+  string expected_fname = strings::StrCat(fname, "b2__3__4");
+  string read;
+  TF_EXPECT_OK(ReadFileToString(Env::Default(), expected_fname, &read));
+
+  BenchmarkEntries benchmark_entries;
+  ASSERT_TRUE(benchmark_entries.ParseFromString(read));
+  ASSERT_EQ(1, benchmark_entries.entry_size());
+  const BenchmarkEntry& benchmark_entry = benchmark_entries.entry(0);
+  const auto& extras = benchmark_entry.extras();
+  ASSERT_EQ(2, extras.size());
+  EXPECT_EQ("abc", extras.at("string_prop").string_value());
+  EXPECT_EQ(4.0, extras.at("double_prop").double_value());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/BUILD b/tensorflow/core/util/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f0f161ecc017966cef1a59e6870b016bdfb8d3ec
--- /dev/null
+++ b/tensorflow/core/util/rpc/BUILD
@@ -0,0 +1,48 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "call_container",
+    hdrs = ["call_container.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "rpc_factory",
+    srcs = ["rpc_factory.cc"],
+    hdrs = ["rpc_factory.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "rpc_factory_registry",
+    srcs = ["rpc_factory_registry.cc"],
+    hdrs = ["rpc_factory_registry.h"],
+    deps = [
+        ":rpc_factory",
+        "//tensorflow/core:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "rpc_factory_registry_test",
+    srcs = ["rpc_factory_registry_test.cc"],
+    deps = [
+        ":rpc_factory_registry",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
diff --git a/tensorflow/core/util/rpc/call_container.h b/tensorflow/core/util/rpc/call_container.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f3605679759b93d3eeef4e2919f905781317707
--- /dev/null
+++ b/tensorflow/core/util/rpc/call_container.h
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
+#define TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
+
+#include <list>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
+
+namespace tensorflow {
+
+template <typename Call>
+class CallContainer {
+ public:
+  explicit CallContainer(OpKernelContext* ctx, int num_calls, bool fail_fast,
+                         bool try_rpc, AsyncOpKernel::DoneCallback done,
+                         CancellationToken token)
+      : ctx_(ctx),
+        done_(std::move(done)),
+        token_(token),
+        fail_fast_(fail_fast),
+        try_rpc_(try_rpc) {
+    CHECK_GT(num_calls, 0);
+
+    // This will run when all RPCs are finished.
+    reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
+      ctx_->cancellation_manager()->DeregisterCallback(token_);
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    });
+
+    // Subtract reference count from the initial creation.
+    core::ScopedUnref unref(reffed_status_callback_);
+
+    for (int i = 0; i < num_calls; ++i) {
+      // Increase the reference on the callback for each new RPC.
+      reffed_status_callback_->Ref();
+    }
+  }
+
+  std::list<Call>* calls() { return &calls_; }
+
+  void StartCancel() {
+    // Once this loop is done, can no longer assume anything is valid
+    // because "delete this" may have been immediately called.
+    // Nothing should run after this loop.
+    for (auto& call : calls_) {
+      call.StartCancel();
+    }
+  }
+
+  void Done(const Status& s, int index) {
+    if (!try_rpc_) {
+      reffed_status_callback_->UpdateStatus(s);
+    }
+    reffed_status_callback_->Unref();
+  }
+
+ private:
+  OpKernelContext* ctx_;
+  std::list<Call> calls_;
+  const AsyncOpKernel::DoneCallback done_;
+  const CancellationToken token_;
+  const bool fail_fast_;
+  const bool try_rpc_;
+
+  // Performs its own reference counting.
+  ReffedStatusCallback* reffed_status_callback_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory.cc b/tensorflow/core/util/rpc/rpc_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8530f02b6e2e021ed1c01db9a5bf25f5789a1142
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/strings/numbers.h"
+
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+template <>
+bool GetEnvVar(const char* key, const string& default_value, string* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+  } else {
+    *value = env_value;
+  }
+  return true;
+}
+
+template <>
+bool GetEnvVar(const char* key, const int64& default_value, int64* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+    return true;
+  }
+  return strings::safe_strto64(env_value, value);
+}
+
+template <>
+bool GetEnvVar(const char* key, const uint64& default_value, uint64* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+    return true;
+  }
+  return strings::safe_strtou64(env_value, value);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/rpc_factory.h b/tensorflow/core/util/rpc/rpc_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bf078c0f4a33e9d32c57357cc0b176aedd4f52f
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory.h
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
+#define TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+// Return the environment variable `key`.  If the variable is not set,
+// use the default value.  If it is set but could not be parsed,
+// return `false`.  Otherwise set `value` and return `true`.
+template <typename T>
+bool GetEnvVar(const char* key, const T& default_value, T* value);
+
+class RPCFactory {
+ public:
+  RPCFactory() {}
+  virtual ~RPCFactory() {}
+
+  // Start a Call() to methods `method_t` at addresses `address_t` with
+  // request strings from `request_t`.  Any of these may be scalar
+  // Tensors, in which case the operands are broadcasted.
+  // Upon completion of all requests, `response_t` will be populated.
+  //
+  // If `try_rpc` is `true`, then `status_message_t` and
+  // `status_code_t` will be populated as well.
+  //
+  // If `try_rpc` is `false`, then `status_message_t` and
+  // `status_code_t` are ignored (and may be nullptr).  Instead, the
+  // status of any failed call will be propagated to the op.
+  //
+  // REQUIRES:
+  //   - `response_t` is not null, and is a string Tensor with the same shape as
+  //     `request_t`.
+  //
+  //   If `try_rpc` is `true`:
+  //      - `status_code_t` and `status_message_t` are not null.
+  //      - `status_code_t` is an int32 Tensor with the same shape as
+  //        `request_t`.
+  //      - `status_message_t` is a string Tensor with the same shape as
+  //        `request_t`.
+  virtual void Call(OpKernelContext* ctx, int64 num_elements,
+                    const Tensor& address_t, const Tensor& method_t,
+                    const Tensor& request_t, const bool try_rpc,
+                    Tensor* response_t, Tensor* status_code_t,
+                    Tensor* status_message_t,
+                    AsyncOpKernel::DoneCallback done) = 0;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(RPCFactory);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry.cc b/tensorflow/core/util/rpc/rpc_factory_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a148b5c04d0dbe551dd11d001f6434b23e99714f
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+
+RPCFactoryRegistry* RPCFactoryRegistry::Global() {
+  static RPCFactoryRegistry* registry = new RPCFactoryRegistry;
+  return registry;
+}
+
+RPCFactoryRegistry::RPCFactoryFn* RPCFactoryRegistry::Get(
+    const string& protocol) {
+  auto found = fns_.find(protocol);
+  if (found == fns_.end()) return nullptr;
+  return &found->second;
+}
+
+void RPCFactoryRegistry::Register(const string& protocol,
+                                  const RPCFactoryFn& factory_fn) {
+  auto existing = Get(protocol);
+  CHECK_EQ(existing, nullptr)
+      << "RPC factory for protocol: " << protocol << " already registered";
+  fns_.insert(std::pair<const string&, RPCFactoryFn>(protocol, factory_fn));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry.h b/tensorflow/core/util/rpc/rpc_factory_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..2635a4012e8f243c8d4334ad3477e184e8cd53a2
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry.h
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
+
+#include <map>
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+class RPCFactoryRegistry {
+ public:
+  typedef std::function<RPCFactory*(OpKernelConstruction* ctx, bool fail_fast,
+                                    int64 timeout_in_ms)>
+      RPCFactoryFn;
+
+  // Returns a pointer to a global RPCFactoryRegistry object.
+  static RPCFactoryRegistry* Global();
+
+  // Returns a pointer to an function that creates an RPC factory for the given
+  // protocol.
+  RPCFactoryFn* Get(const string& protocol);
+
+  // Registers a function that creates and RPC factory for the given protocol.
+  // The function should transfer the ownership of the factory to its caller.
+  void Register(const string& protocol, const RPCFactoryFn& factory_fn);
+
+ private:
+  std::map<string, RPCFactoryFn> fns_;
+};
+
+namespace rpc_factory_registration {
+
+class RPCFactoryRegistration {
+ public:
+  RPCFactoryRegistration(const string& protocol,
+                         const RPCFactoryRegistry::RPCFactoryFn& factory_fn) {
+    RPCFactoryRegistry::Global()->Register(protocol, factory_fn);
+  }
+};
+
+}  // namespace rpc_factory_registration
+
+#define REGISTER_RPC_FACTORY(protocol, factory_fn) \
+  REGISTER_RPC_FACTORY_UNIQ_HELPER(__COUNTER__, protocol, factory_fn)
+
+#define REGISTER_RPC_FACTORY_UNIQ_HELPER(ctr, protocol, factory_fn) \
+  REGISTER_RPC_FACTORY_UNIQ(ctr, protocol, factory_fn)
+
+#define REGISTER_RPC_FACTORY_UNIQ(ctr, protocol, factory_fn) \
+  static rpc_factory_registration::RPCFactoryRegistration    \
+      rpc_factory_registration_fn_##ctr(protocol, factory_fn)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry_test.cc b/tensorflow/core/util/rpc/rpc_factory_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfd0f95016ed344924c9366bf43ff0ccb47e548c
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry_test.cc
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Value {
+  static RPCFactory* Function(OpKernelConstruction* ctx, bool fail_fast,
+                              int64 timeout_in_ms) {
+    return nullptr;
+  }
+};
+
+REGISTER_RPC_FACTORY("TEST FACTORY 1", Value::Function);
+REGISTER_RPC_FACTORY("TEST FACTORY 2", Value::Function);
+}  // namespace
+
+TEST(RPCFactoryRegistryTest, TestBasic) {
+  EXPECT_EQ(RPCFactoryRegistry::Global()->Get("NON-EXISTENT"), nullptr);
+  auto factory1 = RPCFactoryRegistry::Global()->Get("TEST FACTORY 1");
+  EXPECT_NE(factory1, nullptr);
+  auto factory2 = RPCFactoryRegistry::Global()->Get("TEST FACTORY 2");
+  EXPECT_NE(factory2, nullptr);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
index 38f84286347622d1de0646cdc621d5fb1447e588..ef3b030e3277c1ff82b15949f0733ea78d3d0f10 100644
--- a/tensorflow/docs_src/deploy/s3.md
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -1,22 +1,13 @@
 # How to run TensorFlow on S3
 
-This document describes how to run TensorFlow on S3 file system.
+Tensorflow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitious, and can help in situations where data must accessed by multiple actors, such as in distributed training.
 
-## S3
+This document guides you through the required setup, and provides examples on usage.
 
-We assume that you are familiar with @{$reading_data$reading data}.
-
-To use S3 with TensorFlow, change the file paths you use to read and write
-data to an S3 path. For example:
-
-```python
-filenames = ["s3://bucketname/path/to/file1.tfrecord",
-             "s3://bucketname/path/to/file2.tfrecord"]
-dataset = tf.data.TFRecordDataset(filenames)
-```
+## Configuration
 
 When reading or writing data on S3 with your TensorFlow program, the behavior
-could be controlled by various environmental variables:
+can be controlled by various environmental variables:
 
 *   **AWS_REGION**: By default, regional endpoint is used for S3, with region
     controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then
@@ -28,7 +19,7 @@ could be controlled by various environmental variables:
 *   **S3_VERIFY_SSL**: If HTTPS is used, SSL verification could be disabled
     with `S3_VERIFY_SSL=0`.
 
-To read or write objects in a bucket that is no publicly accessible,
+To read or write objects in a bucket that is not publicly accessible,
 AWS credentials must be provided through one of the following methods:
 
 *   Set credentials in the AWS credentials profile file on the local system,
@@ -38,3 +29,65 @@ AWS credentials must be provided through one of the following methods:
     variables.
 *   If TensorFlow is deployed on an EC2 instance, specify an IAM role and then
     give the EC2 instance access to that role.
+
+## Example Setup
+
+Using the above information, we can configure Tensorflow to communicate to an S3 endpoint by setting the following environment variables:
+
+```bash
+AWS_ACCESS_KEY_ID=XXXXX                 # Credentials only needed if connecting to a private endpoint
+AWS_SECRET_ACCESS_KEY=XXXXX
+AWS_REGION=us-east-1                    # Region for the S3 bucket, this is not always needed. Default is us-east-1.
+S3_ENDPOINT=s3.us-east-1.amazonaws.com  # The S3 API Endpoint to connect to. This is specified in a HOST:PORT format.
+S3_USE_HTTPS=1                          # Whether or not to use HTTPS. Disable with 0.
+S3_VERIFY_SSL=1                         # If HTTPS is used, conterols if SSL should be enabled. Disable with 0.
+```
+
+## Usage
+
+Once setup is completed, Tensorflow can interact with S3 in a variety of ways. Anywhere there is a Tensorflow IO function, an S3 URL can be used.
+
+### Smoke Test
+
+To test your setup, stat a file:
+
+```python
+from tensorflow.python.lib.io import file_io
+print file_io.stat('s3://bucketname/path/')
+```
+
+You should see output similar to this:
+
+```console
+<tensorflow.python.pywrap_tensorflow_internal.FileStatistics; proxy of <Swig Object of type 'tensorflow::FileStatistics *' at 0x10c2171b0> >
+```
+
+### Reading Data
+
+When @{$reading_data$reading data}, change the file paths you use to read and write
+data to an S3 path. For example:
+
+```python
+filenames = ["s3://bucketname/path/to/file1.tfrecord",
+             "s3://bucketname/path/to/file2.tfrecord"]
+dataset = tf.data.TFRecordDataset(filenames)
+```
+
+### Tensorflow Tools
+
+Many Tensorflow tools, such as Tensorboard or model serving, can also take S3 URLS as arguments:
+
+```bash
+tensorboard --logdir s3://bucketname/path/to/model/
+tensorflow_model_server --port=9000 --model_name=model --model_base_path=s3://bucketname/path/to/model/export/
+```
+
+This enables an end to end workflow using S3 for all data needs.
+
+## S3 Endpoint Implementations
+
+S3 was invented by Amazon, but the S3 API has spread in popularity and has several implementations. The following implementations have passed basic compatibility tests:
+
+* [Amazon S3](https://aws.amazon.com/s3/)
+* [Google Storage](https://cloud.google.com/storage/docs/interoperability)
+* [Minio](https://www.minio.io/kubernetes.html)(Standalone mode only)
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 15075e1df8e703415b4acb8e53f76dc9a4a41b50..84da2165b599cda64b5f2ae2ceecfb9ac1934a42 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -530,56 +530,58 @@ form [described below](#attr_types).
 
 For example, if you'd like the `ZeroOut` op to preserve a user-specified index,
 instead of only the 0th element, you can register the op like so:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("preserve\_index: int")</b>
-    .Input("to\_zero: int32")
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("preserve_index: int")
+    .Input("to_zero: int32")
     .Output("zeroed: int32");
-</code></pre>
+```
 
 (Note that the set of [attribute types](#attr_types) is different from the
 @{tf.DType$tensor types} used for inputs and outputs.)
 
 Your kernel can then access this attr in its constructor via the `context`
 parameter:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
 class ZeroOutOp : public OpKernel {
  public:
-  explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {<b>
+  explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {
     // Get the index of the value to preserve
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;GetAttr("preserve\_index", &preserve\_index\_));
-    // Check that preserve\_index is positive
-    OP\_REQUIRES(context, preserve\_index_ &gt;= 0,
-                errors::InvalidArgument("Need preserve\_index &gt;= 0, got ",
-                                        preserve\_index_));
-  </b>}
-  void Compute(OpKernelContext\* context) override {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("preserve_index", &preserve_index_));
+    // Check that preserve_index is positive
+    OP_REQUIRES(context, preserve_index_ >= 0,
+                errors::InvalidArgument("Need preserve_index >= 0, got ",
+                                        preserve_index_));
+  }
+  void Compute(OpKernelContext* context) override {
     // ...
   }
- <b>private:
-  int preserve\_index\_;</b>
+ private:
+  int preserve_index_;
 };
-</code></pre>
+```
 
 which can then be used in the `Compute` method:
-<pre class="prettyprint"><code class="lang-cpp">
-  void Compute(OpKernelContext\* context) override {
+```c++
+  void Compute(OpKernelContext* context) override {
     // ...
-<br/>
-    <b>// We're using saved attr to validate potentially dynamic input
-    // So we check that preserve\_index is in range
-    OP\_REQUIRES(context, preserve\_index_ &lt; input.dimension(0),
-                errors::InvalidArgument("preserve\_index out of range"));<br/>
-    </b>// Set all the elements of the output tensor to 0
+
+    // We're using saved attr to validate potentially dynamic input
+    // So we check that preserve_index is in range
+    OP_REQUIRES(context, preserve_index_ < input.dimension(0),
+                errors::InvalidArgument("preserve_index out of range"));
+
+    // Set all the elements of the output tensor to 0
     const int N = input.size();
     for (int i = 0; i < N; i++) {
       output\_flat(i) = 0;
-    }<br/>
-    <b>// Preserve the requested input value
-    output\_flat(preserve\_index\_) = input(preserve\_index\_);</b>
+    }
+
+    // Preserve the requested input value
+    output_flat(preserve_index_) = input(preserve_index_);
   }
-</code></pre>
+```
 
 #### Attr types
 
@@ -725,12 +727,12 @@ you would then register an `OpKernel` for each supported type.
 
 For instance, if you'd like the `ZeroOut` op to work on `float`s
 in addition to `int32`s, your op registration might look like:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Your op registration now specifies that the input's type must be `float`, or
 `int32`, and that its output will be the same type, since both have type `T`.
@@ -790,66 +792,73 @@ Your op registration now specifies that the input's type must be `float`, or
 >   """
 > ```
 
-<pre class="prettyprint"><code class="lang-cpp">
-\#include "tensorflow/core/framework/op_kernel.h"<br/>
-class ZeroOut<b>Int32</b>Op : public OpKernel {
+```c++
+#include "tensorflow/core/framework/op_kernel.h"
+
+class ZeroOutInt32Op : public OpKernel {
   // as before
-};<br/>
-class ZeroOut<b>Float</b>Op : public OpKernel {
+};
+
+class ZeroOutFloatOp : public OpKernel {
  public:
-  explicit ZeroOut<b>Float</b>Op(OpKernelConstruction\* context)
-      : OpKernel(context) {}<br/>
-  void Compute(OpKernelContext\* context) override {
+  explicit ZeroOutFloatOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
     // Grab the input tensor
-    const Tensor& input\_tensor = context-&gt;input(0);
-    auto input = input\_tensor.flat&lt;<b>float</b>&gt;();<br/>
+    const Tensor& input_tensor = context->input(0);
+    auto input = input_tensor.flat<float>();
+
     // Create an output tensor
     Tensor* output = NULL;
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;allocate\_output(0, input_tensor.shape(), &output));
-    auto output\_flat = output-&gt;template flat&lt;<b>float</b>&gt;();<br/>
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_tensor.shape(), &output));
+    auto output_flat = output->template flat<float>();
+
     // Set all the elements of the output tensor to 0
     const int N = input.size();
-    for (int i = 0; i &lt; N; i++) {
-      output\_flat(i) = 0;
-    }<br/>
+    for (int i = 0; i < N; i++) {
+      output_flat(i) = 0;
+    }
+
     // Preserve the first input value
-    if (N &gt; 0) output\_flat(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
   }
-};<br/><b>
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+};
+
+// Note that TypeConstraint<int32>("T") means that attr "T" (defined
 // in the op registration above) must be "int32" to use this template
-// instantiation.</b>
-REGISTER\_KERNEL\_BUILDER(
+// instantiation.
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    <b>.TypeConstraint&lt;int32&gt;("T"),</b>
-    ZeroOutOp<b>Int32</b>);
-<b>REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<int32>("T"),
+    ZeroOutOpInt32);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;float&gt;("T"),
+    .Device(DEVICE_CPU)
+    .TypeConstraint<float>("T"),
     ZeroOutFloatOp);
-</b></code></pre>
+```
 
 > To preserve [backwards compatibility](#backwards-compatibility), you should
 > specify a [default value](#default-values-constraints) when adding an attr to
 > an existing op:
 >
-> <pre class="prettyprint"><code class="lang-cpp">
-> REGISTER\_OP("ZeroOut")
->   <b>.Attr("T: {float, int32} = DT_INT32")</b>
->   .Input("to\_zero: T")
+> ```c++
+> REGISTER_OP("ZeroOut")
+>   .Attr("T: {float, int32} = DT_INT32")
+>   .Input("to_zero: T")
 >   .Output("zeroed: T")
-> </code></pre>
+> ```
 
 Let's say you wanted to add more types, say `double`:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, <b>double,</b> int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, double, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Instead of writing another `OpKernel` with redundant code as above, often you
 will be able to use a C++ template instead.  You will still have one kernel
diff --git a/tensorflow/docs_src/extend/language_bindings.md b/tensorflow/docs_src/extend/language_bindings.md
index b9fd72978dd11046e5347b9bce2bddd345ca426b..9a968d365be15e087482c9dcf555b8c128a3e21d 100644
--- a/tensorflow/docs_src/extend/language_bindings.md
+++ b/tensorflow/docs_src/extend/language_bindings.md
@@ -112,11 +112,11 @@ There are a few ways to get a list of the `OpDef`s for the registered ops:
     to interpret the `OpDef` messages.
 -   The C++ function `OpRegistry::Global()->GetRegisteredOps()` returns the same
     list of all registered `OpDef`s (defined in
-    [`tensorflow/core/framework/op.h`]). This can be used to write the generator
+    [`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h)). This can be used to write the generator
     in C++ (particularly useful for languages that do not have protocol buffer
     support).
 -   The ASCII-serialized version of that list is periodically checked in to
-    [`tensorflow/core/ops/ops.pbtxt`] by an automated process.
+    [`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt) by an automated process.
 
 The `OpDef` specifies the following:
 
@@ -159,7 +159,7 @@ between the generated code and the `OpDef`s checked into the repository, but is
 useful for languages where code is expected to be generated ahead of time like
 `go get` for Go and `cargo ops` for Rust. At the other end of the spectrum, for
 some languages the code could be generated dynamically from
-[`tensorflow/core/ops/ops.pbtxt`].
+[`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt).
 
 #### Handling Constants
 
@@ -229,6 +229,3 @@ and "while") is not available in languages other than Python. This will be
 updated when the [C API] provides necessary support.
 
 [C API]: https://www.tensorflow.org/code/tensorflow/c/c_api.h
-[`tensorflow/core/ops/ops.pbtxt`]: https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt
-[`tensorflow/python/BUILD`]: https://www.tensorflow.org/code/tensorflow/python/BUILD
-[`tensorflow/core/framework/op.h`]: https://www.tensorflow.org/code/tensorflow/core/framework/op.h
diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index 10e717c280f09c4f1bdfea9d0a2c8d3a00191734..2c33a6b6f7e5f1faf04d38e95b74d184134a1edf 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -1,4 +1,4 @@
-# Custom Data Readers
+# Reading custom file and record formats
 
 PREREQUISITES:
 
@@ -9,187 +9,273 @@ PREREQUISITES:
 
 We divide the task of supporting a file format into two pieces:
 
-*   File formats: We use a *Reader* Op to read a *record* (which can be any
-    string) from a file.
-*   Record formats: We use decoder or parsing Ops to turn a string record
+*   File formats: We use a reader `tf.data.Dataset` to read raw *records* (which
+    are typically represented by scalar string tensors, but can have more
+    structure) from a file.
+*   Record formats: We use decoder or parsing ops to turn a string record
     into tensors usable by TensorFlow.
 
 For example, to read a
 [CSV file](https://en.wikipedia.org/wiki/Comma-separated_values), we use
-@{tf.TextLineReader$a Reader for text files}
-followed by
-@{tf.decode_csv$an Op that parses CSV data from a line of text}.
+@{tf.data.TextLineDataset$a dataset for reading text files line-by-line}
+and then @{tf.data.Dataset.map$map} an
+@{tf.decode_csv$op} that parses CSV data from each line of text in the dataset.
 
 [TOC]
 
-## Writing a Reader for a file format
+## Writing a `Dataset` for a file format
 
-A `Reader` is something that reads records from a file.  There are some examples
-of Reader Ops already built into TensorFlow:
+A @{tf.data.Dataset} represents a sequence of *elements*, which can be the
+individual records in a file. There are several examples of "reader" datasets
+that are already built into TensorFlow:
 
-*   @{tf.TFRecordReader}
-    ([source in `kernels/tf_record_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/tf_record_reader_op.cc))
-*   @{tf.FixedLengthRecordReader}
-    ([source in `kernels/fixed_length_record_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/fixed_length_record_reader_op.cc))
-*   @{tf.TextLineReader}
-    ([source in `kernels/text_line_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/text_line_reader_op.cc))
+*   @{tf.data.TFRecordDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
+*   @{tf.data.FixedLengthRecordDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
+*   @{tf.data.TextLineDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
 
-You can see these all expose the same interface, the only differences
-are in their constructors.  The most important method is `read`.
-It takes a queue argument, which is where it gets filenames to
-read from whenever it needs one (e.g. when the `read` op first runs, or
-the previous `read` reads the last record from a file).  It produces
-two scalar tensors: a string key and a string value.
+Each of these implementations comprises three related classes:
 
-To create a new reader called `SomeReader`, you will need to:
+* A `tensorflow::DatasetOpKernel` subclass (e.g. `TextLineDatasetOp`), which
+  tells TensorFlow how to construct a dataset object from the inputs to and
+  attrs of an op, in its `MakeDataset()` method.
 
-1.  In C++, define a subclass of
-    [`tensorflow::ReaderBase`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_base.h)
-    called `SomeReader`.
-2.  In C++, register a new reader op and kernel with the name `"SomeReader"`.
-3.  In Python, define a subclass of @{tf.ReaderBase} called `SomeReader`.
+* A `tensorflow::GraphDatasetBase` subclass (e.g. `TextLineDatasetOp::Dataset`),
+  which represents the *immutable* definition of the dataset itself, and tells
+  TensorFlow how to construct an iterator object over that dataset, in its
+  `MakeIterator()` method.
 
-You can put all the C++ code in a file in
-`tensorflow/core/user_ops/some_reader_op.cc`. The code to read a file will live
-in a descendant of the C++ `ReaderBase` class, which is defined in
-[`tensorflow/core/kernels/reader_base.h`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_base.h).
-You will need to implement the following methods:
+* A `tensorflow::DatasetIterator<Dataset>` subclass (e.g.
+  `TextLineDatasetOp::Dataset::Iterator`), which represents the *mutable* state
+  of an iterator over a particular dataset, and tells TensorFlow how to get the
+  next element from the iterator, in its `GetNextInternal()` method.
 
-*   `OnWorkStartedLocked`: open the next file
-*   `ReadLocked`: read a record or report EOF/error
-*   `OnWorkFinishedLocked`: close the current file, and
-*   `ResetLocked`: get a clean slate after, e.g., an error
+The most important method is the `GetNextInternal()` method, since it defines
+how to actually read records from the file and represent them as one or more
+`Tensor` objects.
 
-These methods have names ending in "Locked" since `ReaderBase` makes sure
-to acquire a mutex before calling any of these methods, so you generally don't
-have to worry about thread safety (though that only protects the members of the
-class, not global state).
+To create a new reader dataset called (for example) `MyReaderDataset`, you will
+need to:
 
-For `OnWorkStartedLocked`, the name of the file to open is the value returned by
-the `current_work()` method.  `ReadLocked` has this signature:
+1. In C++, define subclasses of `tensorflow::DatasetOpKernel`,
+   `tensorflow::GraphDatasetBase`, and `tensorflow::DatasetIterator<Dataset>`
+   that implement the reading logic.
+2. In C++, register a new reader op and kernel with the name
+   `"MyReaderDataset"`.
+3. In Python, define a subclass of @{tf.data.Dataset} called `MyReaderDataset`.
 
-```c++
-Status ReadLocked(string* key, string* value, bool* produced, bool* at_end)
-```
-
-If `ReadLocked` successfully reads a record from the file, it should fill in:
-
-*   `*key`: with an identifier for the record, that a human could use to find
-    this record again.  You can include the filename from `current_work()`,
-    and append a record number or whatever.
-*   `*value`: with the contents of the record.
-*   `*produced`: set to `true`.
-
-If you hit the end of a file (EOF), set `*at_end` to `true`.  In either case,
-return `Status::OK()`.  If there is an error, simply return it using one of the
-helper functions from
-[`tensorflow/core/lib/core/errors.h`](https://www.tensorflow.org/code/tensorflow/core/lib/core/errors.h)
-without modifying any arguments.
-
-Next you will create the actual Reader op.  It will help if you are familiar
-with @{$adding_an_op$the adding an op how-to}.  The main steps
-are:
-
-*   Registering the op.
-*   Define and register an `OpKernel`.
-
-To register the op, you will use a `REGISTER_OP` call defined in
-[`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h).
-Reader ops never take any input and always have a single output with type
-`resource`.  They should have string `container` and `shared_name` attrs.
-You may optionally define additional attrs
-for configuration or include documentation in a `Doc`.  For examples, see
-[`tensorflow/core/ops/io_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/ops/io_ops.cc),
-e.g.:
+You can put all the C++ code in a single file, such as
+`my_reader_dataset_op.cc`. It will help if you are
+familiar with @{$adding_an_op$the adding an op how-to}. The following skeleton
+can be used as a starting point for your implementation:
 
 ```c++
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
-REGISTER_OP("TextLineReader")
-    .Output("reader_handle: resource")
-    .Attr("skip_header_lines: int = 0")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs the lines of a file delimited by '\n'.
-)doc");
-```
-
-To define an `OpKernel`, Readers can use the shortcut of descending from
-`ReaderOpKernel`, defined in
-[`tensorflow/core/framework/reader_op_kernel.h`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_op_kernel.h),
-and implement a constructor that calls `SetReaderFactory`.  After defining
-your class, you will need to register it using `REGISTER_KERNEL_BUILDER(...)`.
-An example with no attrs:
+namespace tensorflow {
+namespace {
 
-```c++
-#include "tensorflow/core/framework/reader_op_kernel.h"
-
-class TFRecordReaderOp : public ReaderOpKernel {
+class MyReaderDatasetOp : public DatasetOpKernel {
  public:
-  explicit TFRecordReaderOp(OpKernelConstruction* context)
-      : ReaderOpKernel(context) {
-    Env* env = context->env();
-    SetReaderFactory([this, env]() { return new TFRecordReader(name(), env); });
-  }
-};
 
-REGISTER_KERNEL_BUILDER(Name("TFRecordReader").Device(DEVICE_CPU),
-                        TFRecordReaderOp);
-```
+  MyReaderDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {
+    // Parse and validate any attrs that define the dataset using
+    // `ctx->GetAttr()`, and store them in member variables.
+  }
 
-An example with attrs:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    // Parse and validate any input tensors 0that define the dataset using
+    // `ctx->input()` or the utility function
+    // `ParseScalarArgument<T>(ctx, &arg)`.
 
-```c++
-#include "tensorflow/core/framework/reader_op_kernel.h"
-
-class TextLineReaderOp : public ReaderOpKernel {
- public:
-  explicit TextLineReaderOp(OpKernelConstruction* context)
-      : ReaderOpKernel(context) {
-    int skip_header_lines = -1;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("skip_header_lines", &skip_header_lines));
-    OP_REQUIRES(context, skip_header_lines >= 0,
-                errors::InvalidArgument("skip_header_lines must be >= 0 not ",
-                                        skip_header_lines));
-    Env* env = context->env();
-    SetReaderFactory([this, skip_header_lines, env]() {
-      return new TextLineReader(name(), skip_header_lines, env);
-    });
+    // Create the dataset object, passing any (already-validated) arguments from
+    // attrs or input tensors.
+    *output = new Dataset(ctx);
   }
-};
 
-REGISTER_KERNEL_BUILDER(Name("TextLineReader").Device(DEVICE_CPU),
-                        TextLineReaderOp);
-```
-
-The last step is to add the Python wrapper.  You can either do this by
-@{$adding_an_op#build_the_op_library$compiling a dynamic library}
-or, if you are building TensorFlow from source, adding to `user_ops.py`.
-For the latter, you will import `tensorflow.python.ops.io_ops` in
-[`tensorflow/python/user_ops/user_ops.py`](https://www.tensorflow.org/code/tensorflow/python/user_ops/user_ops.py)
-and add a descendant of [`io_ops.ReaderBase`](https://www.tensorflow.org/code/tensorflow/python/ops/io_ops.py).
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx) : GraphDatasetBase(ctx) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::MyReader")}));
+    }
+
+    // Record structure: Each record is represented by a scalar string tensor.
+    //
+    // Dataset elements can have a fixed number of components of different
+    // types and shapes; replace the following two methods to customize this
+    // aspect of the dataset.
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "MyReaderDatasetOp::Dataset"; }
+
+   protected:
+    // Optional: Implementation of `GraphDef` serialization for this dataset.
+    //
+    // Implement this method if you want to be able to save and restore
+    // instances of this dataset (and any iterators over it).
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      // Construct nodes to represent any of the input tensors from this
+      // object's member variables using `b->AddScalar()` and `b->AddVector()`.
+      std::vector<Node*> input_tensors;
+      TF_RETURN_IF_ERROR(b->AddDataset(this, input_tensors, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params), i_(0) {}
+
+      // Implementation of the reading logic.
+      //
+      // The example implementation in this file yields the string "MyReader!"
+      // ten times. In general there are three cases:
+      //
+      // 1. If an element is successfully read, store it as one or more tensors
+      //    in `*out_tensors`, set `*end_of_sequence = false` and return
+      //    `Status::OK()`.
+      // 2. If the end of input is reached, set `*end_of_sequence = true` and
+      //    return `Status::OK()`.
+      // 3. If an error occurs, return an error status using one of the helper
+      //    functions from "tensorflow/core/lib/core/errors.h".
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        // NOTE: `GetNextInternal()` may be called concurrently, so it is
+        // recommended that you protect the iterator state with a mutex.
+        mutex_lock l(mu_);
+        if (i_ < 10) {
+          // Create a scalar string tensor and add it to the output.
+          Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+          record_tensor.scalar<string>()() = "MyReader!";
+          out_tensors->emplace_back(std::move(record_tensor));
+          ++i_;
+          *end_of_sequence = false;
+        } else {
+          *end_of_sequence = true;
+        }
+        return Status::OK();
+      }
+
+     protected:
+      // Optional: Implementation of iterator state serialization for this
+      // iterator.
+      //
+      // Implement these two methods if you want to be able to save and restore
+      // instances of this iterator.
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
+        return Status::OK();
+      }
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+    };
+  };
+};
 
-```python
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import common_shapes
-from tensorflow.python.ops import io_ops
+// Register the op definition for MyReaderDataset.
+//
+// Dataset ops always have a single output, of type `variant`, which represents
+// the constructed `Dataset` object.
+//
+// Add any attrs and input tensors that define the dataset here.
+REGISTER_OP("MyReaderDataset")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
 
-class SomeReader(io_ops.ReaderBase):
+// Register the kernel implementation for MyReaderDataset.
+REGISTER_KERNEL_BUILDER(Name("MyReaderDataset").Device(DEVICE_CPU),
+                        MyReaderDatasetOp);
 
-    def __init__(self, name=None):
-        rr = gen_user_ops.some_reader(name=name)
-        super(SomeReader, self).__init__(rr)
+}  // namespace
+}  // namespace tensorflow
+```
 
+The last step is to build the C++ code and add a Python wrapper. The easiest way
+to do this is by @{$adding_an_op#build_the_op_library$compiling a dynamic
+library} (e.g. called `"my_reader_dataset_op.so"`), and adding a Python class
+that subclasses @{tf.data.Dataset} to wrap it. An example Python program is
+given here:
 
-ops.NotDifferentiable("SomeReader")
+```python
+import tensorflow as tf
+
+# Assumes the file is in the current working directory.
+my_reader_dataset_module = tf.load_op_library("./my_reader_dataset_op.so")
+
+class MyReaderDataset(tf.data.Dataset):
+
+  def __init__(self):
+    super(MyReaderDataset, self).__init__()
+    # Create any input attrs or tensors as members of this class.
+
+  def _as_variant_tensor(self):
+    # Actually construct the graph node for the dataset op.
+    #
+    # This method will be invoked when you create an iterator on this dataset
+    # or a dataset derived from it.
+    return my_reader_dataset_module.my_reader_dataset()
+
+  # The following properties define the structure of each element: a scalar
+  # `tf.string` tensor. Change these properties to match the `output_dtypes()`
+  # and `output_shapes()` methods of `MyReaderDataset::Dataset` if you modify
+  # the structure of each element.
+  @property
+  def output_types(self):
+    return tf.string
+
+  @property
+  def output_shapes(self):
+    return tf.TensorShape([])
+
+  @property
+  def output_classes(self):
+    return tf.Tensor
+
+if __name__ == "__main__":
+  # Create a MyReaderDataset and print its elements.
+  with tf.Session() as sess:
+    iterator = MyReaderDataset().make_one_shot_iterator()
+    next_element = iterator.get_next()
+    try:
+      while True:
+        print(sess.run(next_element))  # Prints "MyReader!" ten times.
+    except tf.errors.OutOfRangeError:
+      pass
 ```
 
-You can see some examples in
-[`tensorflow/python/ops/io_ops.py`](https://www.tensorflow.org/code/tensorflow/python/ops/io_ops.py).
+You can see some examples of `Dataset` wrapper classes in
+[`tensorflow/python/data/ops/dataset_ops.py`](https://www.tensorflow.org/code/tensorflow/python/data/ops/dataset_ops.py).
 
 ## Writing an Op for a record format
 
@@ -201,9 +287,7 @@ track down where the bad data came from.
 
 Examples of Ops useful for decoding records:
 
-*   @{tf.parse_single_example}
-    (and
-    @{tf.parse_example})
+*   @{tf.parse_single_example} (and @{tf.parse_example})
 *   @{tf.decode_csv}
 *   @{tf.decode_raw}
 
@@ -211,11 +295,6 @@ Note that it can be useful to use multiple Ops to decode a particular record
 format.  For example, you may have an image saved as a string in
 [a `tf.train.Example` protocol buffer](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 Depending on the format of that image, you might take the corresponding output
-from a
-@{tf.parse_single_example}
-op and call @{tf.image.decode_jpeg},
-@{tf.image.decode_png}, or
-@{tf.decode_raw}.  It is common to
-take the output of `tf.decode_raw` and use
-@{tf.slice} and
-@{tf.reshape} to extract pieces.
+from a @{tf.parse_single_example} op and call @{tf.image.decode_jpeg},
+@{tf.image.decode_png}, or @{tf.decode_raw}.  It is common to take the output
+of `tf.decode_raw` and use @{tf.slice} and @{tf.reshape} to extract pieces.
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 274413e29442d448d989e8574f3c8578ea9da5a0..995b8ae6663ed6694ce9051fe1c4577f077dcad1 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 1a0956634d6b03585a39164a2492df3fd1b4ffc7..2938a8f7eef8aaa4302cccb8e8006b6f9e2808ef 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index cdde45a6f4fb4fc93407bc882d7bc5c8c32fda46..05604d95c5efbc16bbcbd252d0ce1f629ba44da6 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.7.0</version>
+                 <version>1.8.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -93,6 +93,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
 
               // Execute the "MyConst" operation in a Session.
               try (Session s = new Session(g);
+                   // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
                    Tensor output = s.runner().fetch("MyConst").run().get(0)) {
                 System.out.println(new String(output.bytesValue(), "UTF-8"));
               }
@@ -123,12 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.7.0</version>
+  <version>1.8.0-rc0</version>
 </dependency>
 ```
 
@@ -147,7 +148,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -207,6 +208,7 @@ public class HelloTF {
 
       // Execute the "MyConst" operation in a Session.
       try (Session s = new Session(g);
+           // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
            Tensor output = s.runner().fetch("MyConst").run().get(0)) {
         System.out.println(new String(output.bytesValue(), "UTF-8"));
       }
@@ -225,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.7.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.8.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.7.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.7.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 04e4242b0ffd476818f6e5c4522c60111a65e151..1a349f54120cf3adf59f52377ea62fc21c7e42c9 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -65,16 +65,38 @@ must be installed on your system:
     <pre>
     $ <b>sudo apt-get install libcupti-dev</b>
     </pre>
+
   * **[OPTIONAL]**  For optimized inferencing performance, you can also install
-    NVIDIA TensorRT 3.0. For details, see
-    [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing-tar).
-    Only steps 1-4 in the TensorRT Tar File installation instructions are
-    required for compatibility with TensorFlow; the Python package installation
-    in steps 5 and 6 can be omitted. Detailed installation instructions can be found at [package documentataion](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#installing-tensorrt-304)
+    **NVIDIA TensorRT 3.0**. The minimal set of TensorRT runtime components needed
+    for use with the pre-built `tensorflow-gpu` package can be installed as follows:
+
+    <pre>
+    $ <b>wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
+    $ <b>sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
+    $ <b>sudo apt-get update</b>
+    $ <b>sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0</b>
+    </pre>
 
     **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
-    package, please use the Ubuntu **14.04** tar file package of TensorRT
-    even when installing onto an Ubuntu 16.04 system.   
+    package, please use the Ubuntu **14.04** package of TensorRT as shown above,
+    even when installing onto an Ubuntu 16.04 system.<br/>
+    <br/>
+    To build the TensorFlow-TensorRT integration module from source rather than
+    using pre-built binaries, see the [module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow).
+    For detailed TensorRT installation instructions, see [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).<br/>
+    <br/>
+    To avoid cuDNN version conflicts during later system upgrades, you can hold
+    the cuDNN version at 7.0.5:
+
+    <pre>
+    $ <b> sudo apt-mark hold libcudnn7 libcudnn7-dev</b>
+    </pre>
+
+    To later allow upgrades, you can remove the hold:
+
+    <pre>
+    $ <b> sudo apt-mark unhold libcudnn7 libcudnn7-dev</b>
+    </pre>
 
 If you have an earlier version of the preceding packages, please upgrade to
 the specified versions. If upgrading is not possible, then you may still run
@@ -194,7 +216,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -299,7 +321,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -485,7 +507,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -659,14 +681,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -678,14 +700,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -697,14 +719,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -716,14 +738,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index b3e9616a0592c43f457183e53c8e99e55f3f5d94..a237d1af5408c48eb50747d3a16ffb3461ff888e 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -524,7 +524,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 7d7c2aa75aeef15d9b400f2bf5dddb083f387a2b..677e3329b6b5ec2d931e3d7d73d29e87a79f78a9 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -350,10 +350,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.7.0 on Linux:
+for TensorFlow 1.8.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -450,6 +450,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
@@ -471,6 +473,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
@@ -486,6 +489,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index 08a5fbe41c87c88399682208c38bf7a892d8fc1a..c35530061dcaf2a4a894dcdf54fd794907d98162 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -51,7 +51,8 @@ If you haven't already, do the following two things:
         // set to 'bazel', 'cmake', 'makefile', 'none'
         def nativeBuildSystem = 'none'
 
-4. Click the Run button (the green arrow) or use **Run -> Run 'android'** from the top menu.
+4. Click the *Run* button (the green arrow) or select *Run > Run 'android'* from the
+    top menu. You may need to rebuild the project using *Build > Rebuild Project*.
 
     If it asks you to use Instant Run, click **Proceed Without Instant Run**.
 
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 580a899ac4e4f5c3d97ce023f25083168fe00d01..b1796cf9b2d0bf7459e70ab542b6e6fcb203667a 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -475,7 +475,7 @@ optimizations.
 ### TensorFlow with Intel® MKL DNN
 
 Intel® has added optimizations to TensorFlow for Intel® Xeon® and Intel® Xeon
-Phi™ though the use of Intel® Math Kernel Library for Deep Neural Networks
+Phi™ through the use of the Intel® Math Kernel Library for Deep Neural Networks
 (Intel® MKL-DNN) optimized primitives. The optimizations also provide speedups
 for the consumer line of processors, e.g. i5 and i7 Intel processors. The Intel
 published paper
@@ -581,9 +581,9 @@ Each variable that impacts performance is discussed below.
     for optimal settings.
 
 *   **intra_op_parallelism_threads**: Setting this equal to the number of
-    physical cores is recommended. Setting the value to 0, which is the default
-    and will result in the value being set to the number of logical cores, is an
-    option to try for some architectures.  This value and `OMP_NUM_THREADS`
+    physical cores is recommended. Setting the value to 0, which is the default,
+    results in the value being set to the number of logical cores - this is an
+    alternate option to try for some architectures.  This value and `OMP_NUM_THREADS`
     should be equal.
 
 *   **inter_op_parallelism_threads**: Setting this equal to the number of
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 411889cb1c616130f809e6228cc692ba3f951d48..2fea02d861d314cc61f2ba20475bf08ebea8fb5f 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -110,7 +110,7 @@ we've added a separate rewrite for the *eval graph*:
 
 ```
 # Build eval model
-logits = tf.nn.softmax_cross_entropy_with_logits(...)
+logits = tf.nn.softmax_cross_entropy_with_logits_v2(...)
 
 # Call the eval rewrite which rewrites the graph in-place with
 # FakeQuantization nodes and fold batchnorm for eval.
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 217ab596b72bc263ae5dda377a8faab8a39b0a3c..3963d5faa70dc663b8a9e6f12e7c2775b3d82a4a 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -854,12 +854,13 @@ calculation of 'start_indices') is currently implementation-defined.
 | `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
 | `update`        | `ComputationDataHandle` | N dimensional array of type T    |
 :                 :                         : containing the slice update.     :
-:                 :                         : Each dimension of update shape    :
+:                 :                         : Each dimension of update shape   :
 :                 :                         : must be strictly greater than    :
 :                 :                         : zero, and start + update must be :
-:                 :                         : less than operand size for each  :
-:                 :                         : dimension to avoid generating    :
-:                 :                         : out-of-bounds update indices.    :
+:                 :                         : less than or equal to the operand:
+:                 :                         : size for each dimension to avoid :
+:                 :                         : generating out-of-bounds update  :
+:                 :                         : indices.                         :
 | `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
 :                 :                         : containing the starting indices  :
 :                 :                         : of the slice for each dimension. :
diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
index dc5b403428fed524abd2a793e695d11b9d63290e..595e6be4af78d7d684ddeca0adea59e5a754134d 100644
--- a/tensorflow/docs_src/programmers_guide/eager.md
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -102,11 +102,11 @@ print(a.numpy())
 #     [3 4]]
 ```
 
-The `tfe` module contains symbols available to both eager and graph execution
+The `tf.contrib.eager` module contains symbols available to both eager and graph execution
 environments and is useful for writing code to [work with graphs](#work_with_graphs):
 
 ```py
-import tensorflow.contrib.eager as tfe
+tfe = tf.contrib.eager
 ```
 
 ## Dynamic control flow
@@ -213,25 +213,25 @@ their objects.
 [Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
 is useful for implementing machine learning algorithms such as
 [backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
-neural networks. During eager execution, use `tfe.GradientTape` to trace
+neural networks. During eager execution, use `tf.GradientTape` to trace
 operations for computing gradients later.
 
-`tfe.GradientTape` is an opt-in feature to provide maximal performance when
+`tf.GradientTape` is an opt-in feature to provide maximal performance when
 not tracing. Since different operations can occur during each call, all
 forward-pass operations get recorded to a "tape". To compute the gradient, play
-the tape backwards and then discard. A particular `tfe.GradientTape` can only
+the tape backwards and then discard. A particular `tf.GradientTape` can only
 compute one gradient; subsequent calls throw a runtime error.
 
 ```py
 w = tfe.Variable([[1.0]])
-with tfe.GradientTape() as tape:
+with tf.GradientTape() as tape:
   loss = w * w
 
 grad = tape.gradient(loss, [w])
 print(grad)  # => [tf.Tensor([[ 2.]], shape=(1, 1), dtype=float32)]
 ```
 
-Here's an example of `tfe.GradientTape` that records forward-pass operations
+Here's an example of `tf.GradientTape` that records forward-pass operations
 to train a simple model:
 
 ```py
@@ -251,8 +251,8 @@ def loss(weights, biases):
 
 # Return the derivative of loss with respect to weight and bias
 def grad(weights, biases):
-  with tfe.GradientTape() as tape:
-    loss_value = loss(weights, biases) 
+  with tf.GradientTape() as tape:
+    loss_value = loss(weights, biases)
   return tape.gradient(loss_value, [weights, biases])
 
 train_steps = 200
@@ -292,7 +292,7 @@ Final loss: 0.974
 W = 3.01582956314, B = 2.1191945076
 ```
 
-Replay the `tfe.GradientTape` to compute the gradients and apply them in a
+Replay the `tf.GradientTape` to compute the gradients and apply them in a
 training loop. This is demonstrated in an excerpt from the
 [mnist_eager.py](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_eager.py)
 example:
@@ -301,9 +301,9 @@ example:
 dataset = tf.data.Dataset.from_tensor_slices((data.train.images,
                                               data.train.labels))
 ...
-for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
+for (batch, (images, labels)) in enumerate(dataset):
   ...
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     logits = model(images, training=True)
     loss_value = loss(logits, labels)
   ...
@@ -353,17 +353,17 @@ def loss(model, x, y):
   return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=prediction)
 
 def grad(model, inputs, targets):
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     loss_value = loss(model, inputs, targets)
   return tape.gradient(loss_value, model.variables)
 
 optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
 
-x, y = tfe.Iterator(dataset_train).next()
+x, y = iter(dataset_train).next()
 print("Initial loss: {:.3f}".format(loss(model, x, y)))
 
 # Training loop
-for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
+for (i, (x, y)) in enumerate(dataset_train):
   # Calculate derivatives of the input function with respect to its parameters.
   grads = grad(model, x, y)
   # Apply the gradient to the model
@@ -398,7 +398,7 @@ And for faster training, move the computation to a GPU:
 
 ```py
 with tf.device("/gpu:0"):
-  for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
+  for (i, (x, y)) in enumerate(dataset_train):
     # minimize() is equivalent to the grad() and apply_gradients() calls.
     optimizer.minimize(lambda: loss(model, x, y),
                        global_step=tf.train.get_or_create_global_step())
@@ -411,7 +411,7 @@ training to make automatic differentiation easier. The parameters of a model can
 be encapsulated in classes as variables.
 
 Better encapsulate model parameters by using `tfe.Variable` with
-`tfe.GradientTape`. For example, the automatic differentiation example above
+`tf.GradientTape`. For example, the automatic differentiation example above
 can be rewritten:
 
 ```py
@@ -435,7 +435,7 @@ def loss(model, inputs, targets):
   return tf.reduce_mean(tf.square(error))
 
 def grad(model, inputs, targets):
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     loss_value = loss(model, inputs, targets)
   return tape.gradient(loss_value, [model.W, model.B])
 
@@ -585,14 +585,14 @@ for _ in range(iterations):
 
 ### Dynamic models
 
-`tfe.GradientTape` can also be used in dynamic models. This example for a
+`tf.GradientTape` can also be used in dynamic models. This example for a
 [backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search)
 algorithm looks like normal NumPy code, except there are gradients and is
 differentiable, despite the complex control flow:
 
 ```py
 def line_search_step(fn, init_x, rate=1.0):
-  with tfe.GradientTape() as tape:
+  with tf.GradientTape() as tape:
     # Variables are automatically recorded, but manually watch a tensor
     tape.watch(init_x)
     value = fn(init_x)
@@ -608,7 +608,7 @@ def line_search_step(fn, init_x, rate=1.0):
 
 ### Additional functions to compute gradients
 
-`tfe.GradientTape` is a powerful interface for computing gradients, but there
+`tf.GradientTape` is a powerful interface for computing gradients, but there
 is another [Autograd](https://github.com/HIPS/autograd)-style API available for
 automatic differentiation. These functions are useful if writing math code with
 only tensors and gradient functions, and without `tfe.Variables`:
diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index 392ac6f7f12532c3efce5bec1917691f55c7bee5..51c1a1e032baae7eff334da785fc5ffa2438e0ca 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -121,7 +121,7 @@ dimensions:
   devices, which makes it possible to speed up
   @{$deep_cnn$CIFAR-10 training using multiple GPUs}.
 * The Session API allows multiple concurrent steps (i.e. calls to
-  @{tf.Session.run} in parallel. This
+  @{tf.Session.run} in parallel). This
   enables the runtime to get higher throughput, if a single step does not use
   all of the resources in your computer.
 
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index e69b717432e6a8fab0085eb419dcbc0991cd9d28..aa72cae766c3641a2d447032b7dcea58b53ac173 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -96,7 +96,7 @@ to all API functions in the same context.  For example:
   (See @{$programmers_guide/variables} for more information about variables.)
 
 * Calling @{tf.train.Optimizer.minimize} will add operations and tensors to the
-  default graph that calculate gradients, and return a @{tf.Operation} that,
+  default graph that calculates gradients, and return a @{tf.Operation} that,
   when run, will apply those gradients to a set of variables.
 
 Most programs rely solely on the default graph. However,
diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index 017db0e8cb4d239fa4b6be6a5f9d6b0c582a82c2..648d001bd3535fe3dcc460c9ebdb6e6a997dc332 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -5,7 +5,7 @@ works. The units are as follows:
 
 ## High Level APIs
 
-  * @{$programmers_guide/eager}, which is the easiest way to use tensorflow.
+  * @{$programmers_guide/eager}, which is the easiest way to use TensorFlow.
   * @{$programmers_guide/estimators}, which introduces a high-level
     TensorFlow API that greatly simplifies ML programming.
   * @{$programmers_guide/datasets}, which explains how to
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 55ee42dd6405db6bd34b064d71deaeb94839b0fa..c6ef87c54a3bc37dbfc0553232a8e3d30f8ee2f6 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -485,31 +485,7 @@ portion of the signature.  That is, when writing a
 to expect and how to map them to your model's expected inputs.
 By contrast, the *output* portion of the signature is determined by the model.
 
-
-### Perform the export
-
-To export your trained Estimator, call
-@{tf.estimator.Estimator.export_savedmodel} with the export base path and
-the `serving_input_receiver_fn`.
-
-```py
-estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                            strip_default_attrs=True)
-```
-
-This method builds a new graph by first calling the
-`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
-this `Estimator`'s `model_fn()` to generate the model graph based on those
-features. It starts a fresh `Session`, and, by default, restores the most recent
-checkpoint into it.  (A different checkpoint may be passed, if needed.)
-Finally it creates a time-stamped export directory below the given
-`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
-SavedModel into it containing a single `MetaGraphDef` saved from this
-Session.
-
-> Note: It is your responsibility to garbage-collect old exports.
-> Otherwise, successive exports will accumulate under `export_dir_base`.
-
+<a name="specify_outputs"></a>
 ### Specify the outputs of a custom model
 
 When writing a custom `model_fn`, you must populate the `export_outputs` element
@@ -541,6 +517,30 @@ using [`signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`](https://www.tens
 indicating which `SignatureDef` will be served when an inference request
 does not specify one.
 
+<a name="perform_export"></a>
+### Perform the export
+
+To export your trained Estimator, call
+@{tf.estimator.Estimator.export_savedmodel} with the export base path and
+the `serving_input_receiver_fn`.
+
+```py
+estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
+                            strip_default_attrs=True)
+```
+
+This method builds a new graph by first calling the
+`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
+this `Estimator`'s `model_fn()` to generate the model graph based on those
+features. It starts a fresh `Session`, and, by default, restores the most recent
+checkpoint into it.  (A different checkpoint may be passed, if needed.)
+Finally it creates a time-stamped export directory below the given
+`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
+SavedModel into it containing a single `MetaGraphDef` saved from this
+Session.
+
+> Note: It is your responsibility to garbage-collect old exports.
+> Otherwise, successive exports will accumulate under `export_dir_base`.
 
 ### Serve the exported model locally
 
diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
index cb0d86fc4c54ac690f13c93ebbd10805c7738c62..5e3e49d43402cd76f8b7062483259df4598bd8ff 100644
--- a/tensorflow/docs_src/programmers_guide/using_tpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -280,8 +280,8 @@ Where `params['batch-size']` will contain the batch size.
 ### Static shapes and batch size
 
 The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
-free strict static shape requirements imposed by the XLA/TPU environment. The
-one requirement is that the batches of data fed from your input pipeline to
+free from the strict static shape requirements imposed by the XLA/TPU environment.
+The one requirement is that the batches of data fed from your input pipeline to
 the TPU have a static shape, as determined by the standard TensorFlow shape
 inference algorithm. Intermediate tensors are free to have a dynamic shapes.
 If shape inference has failed, but the shape is known it is possible to
diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md
index 7d79f433c41b42a268816d8277ea69b0d62a04f3..372ab47df7df309ab926836ca19f34d2d0d38915 100644
--- a/tensorflow/docs_src/tutorials/audio_recognition.md
+++ b/tensorflow/docs_src/tutorials/audio_recognition.md
@@ -280,7 +280,7 @@ tool:
 ```
 bazel run tensorflow/examples/wav_to_spectrogram:wav_to_spectrogram -- \
 --input_wav=/tmp/speech_dataset/happy/ab00c4b2_nohash_0.wav \
---output_png=/tmp/spectrogram.png
+--output_image=/tmp/spectrogram.png
 ```
 
 If you open up `/tmp/spectrogram.png` you should see something like this:
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index 93d7c86e42aa90d145d27b56edc0abfec7034686..27784eef9cdb5c6f8b9af44b3fc3f876cda39d13 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -1,404 +1,4 @@
 # How to Retrain Inception's Final Layer for New Categories
 
-Modern object recognition models have millions of parameters and can take weeks
-to fully train. Transfer learning is a technique that shortcuts a lot of this
-work by taking a fully-trained model for a set of categories like ImageNet, and
-retrains from the existing weights for new classes. In this example we'll be
-retraining the final layer from scratch, while leaving all the others untouched.
-For more information on the approach you can see
-[this paper on Decaf](https://arxiv.org/pdf/1310.1531v1.pdf).
-
-Though it's not as good as a full training run, this is surprisingly effective
-for many applications, and can be run in as little as thirty minutes on a
-laptop, without requiring a GPU. This tutorial will show you how to run the
-example script on your own images, and will explain some of the options you have
-to help control the training process.
-
-Note: A version of this tutorial is also available
-[as a codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0).
-
-Before you start, you must @{$install$install tensorflow}.
-
-[TOC]
-
-## Training on Flowers
-
-![Daisies by Kelly Sikkema](https://www.tensorflow.org/images/daisies.jpg)
-
-[Image by Kelly Sikkema](https://www.flickr.com/photos/95072945@N05/9922116524/)
-
-Before you start any training, you'll need a set of images to teach the network
-about the new classes you want to recognize. There's a later section that
-explains how to prepare your own images, but to make it easy we've created an
-archive of creative-commons licensed flower photos to use initially. To get the
-set of flower photos, run these commands:
-
-```sh
-cd ~
-curl -O http://download.tensorflow.org/example_images/flower_photos.tgz
-tar xzf flower_photos.tgz
-```
-
-Once you have the images, you can clone the tensorflow repository using the
-following command (these examples are not included in the installation):
-
-```sh
-git clone https://github.com/tensorflow/tensorflow
-```
-
-Then checkout the version of the tensorflow repository matching your
-installation and this tutorial as follows:
-
-``` sh
-cd tensorflow
-git checkout {version}
-```
-
-In the simplest cases the retrainer can then be run like this:
-
-```sh
-python tensorflow/examples/image_retraining/retrain.py --image_dir ~/flower_photos
-```
-
-The script has many other options. You can get a full listing with:
-
-```sh
-python tensorflow/examples/image_retraining/retrain.py -h
-```
-
-This script loads the pre-trained Inception v3 model, removes the old top layer,
-and trains a new one on the flower photos you've downloaded. None of the flower
-species were in the original ImageNet classes the full network was trained on.
-The magic of transfer learning is that lower layers that have been trained to
-distinguish between some objects can be reused for many recognition tasks
-without any alteration.
-
-## Bottlenecks
-
-The script can take thirty minutes or more to complete, depending on the speed
-of your machine. The first phase analyzes all the images on disk and calculates
-the bottleneck values for each of them. 'Bottleneck' is an informal term we
-often use for the layer just before the final output layer that actually does
-the classification. This penultimate layer has been trained to output a set of
-values that's good enough for the classifier to use to distinguish between all
-the classes it's been asked to recognize. That means it has to be a meaningful
-and compact summary of the images, since it has to contain enough information
-for the classifier to make a good choice in a very small set of values. The
-reason our final layer retraining can work on new classes is that it turns out
-the kind of information needed to distinguish between all the 1,000 classes in
-ImageNet is often also useful to distinguish between new kinds of objects.
-
-Because every image is reused multiple times during training and calculating
-each bottleneck takes a significant amount of time, it speeds things up to
-cache these bottleneck values on disk so they don't have to be repeatedly
-recalculated. By default they're stored in the `/tmp/bottleneck` directory, and
-if you rerun the script they'll be reused so you don't have to wait for this
-part again.
-
-## Training
-
-Once the bottlenecks are complete, the actual training of the top layer of the
-network begins. You'll see a series of step outputs, each one showing training
-accuracy, validation accuracy, and the cross entropy. The training accuracy
-shows what percent of the images used in the current training batch were
-labeled with the correct class. The validation accuracy is the precision on a
-randomly-selected group of images from a different set. The key difference is
-that the training accuracy is based on images that the network has been able
-to learn from so the network can overfit to the noise in the training data. A
-true measure of the performance of the network is to measure its performance on
-a data set not contained in the training data -- this is measured by the
-validation accuracy. If the train accuracy is high but the validation accuracy
-remains low, that means the network is overfitting and memorizing particular
-features in the training images that aren't helpful more generally. Cross
-entropy is a loss function which gives a glimpse into how well the learning
-process is progressing. The training's objective is to make the loss as small as
-possible, so you can tell if the learning is working by keeping an eye on
-whether the loss keeps trending downwards, ignoring the short-term noise.
-
-By default this script will run 4,000 training steps. Each step chooses 100
-images at random from the training set, finds their bottlenecks from the cache,
-and feeds them into the final layer to get predictions. Those predictions are
-then compared against the actual labels to update the final layer's weights
-through the back-propagation process. As the process continues you should see
-the reported accuracy improve, and after all the steps are done, a final test
-accuracy evaluation is run on a set of images kept separate from the training
-and validation pictures. This test evaluation is the best estimate of how the
-trained model will perform on the classification task. You should see an
-accuracy value of between 90% and 95%, though the exact value will vary from run
-to run since there's randomness in the training process. This number is based on
-the percent of the images in the test set that are given the correct label
-after the model is fully trained.
-
-## Visualizing the Retraining with TensorBoard
-
-The script includes TensorBoard summaries that make it easier to understand, debug, and optimize the retraining. For example, you can visualize the graph and statistics, such as how the weights or accuracy varied during training.
-
-To launch TensorBoard, run this command during or after retraining:
-
-```sh
-tensorboard --logdir /tmp/retrain_logs
-```
-
-Once TensorBoard is running, navigate your web browser to `localhost:6006` to view the TensorBoard.
-
-The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.
-
-The [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
-
-## Using the Retrained Model
-
-The script will write out a version of the Inception v3 network with a final
-layer retrained to your categories to /tmp/output_graph.pb, and a text file
-containing the labels to /tmp/output_labels.txt. These are both in a format that
-the @{$image_recognition$C++ and Python image classification examples}
-can read in, so you can start using your new model immediately. Since you've
-replaced the top layer, you will need to specify the new name in the script, for
-example with the flag `--output_layer=final_result` if you're using label_image.
-
-Here's an example of how to run the label_image example with your
-retrained graphs:
-
-```sh
-python tensorflow/examples/label_image/label_image.py \
---graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---input_layer=Mul \
---output_layer=final_result \
---input_mean=128 --input_std=128 \
---image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
-```
-
-You should see a list of flower labels, in most cases with daisy on top
-(though each retrained model may be slightly different). You can replace the
-`--image` parameter with your own images to try those out.
-
-If you'd like to use the retrained model in your own Python program, then the
-above
-[`label_image` script](https://www.tensorflow.org/code/tensorflow/examples/label_image/label_image.py)
-is a reasonable starting point. The `label_image`
-directory also contains C++ code which you can use as a template to integrate
-tensorflow with your own applications.
-
-If you find the default Inception v3 model is too large or slow for your
-application, take a look at the [Other Model Architectures section](/tutorials/image_retraining#other_model_architectures)
-below for options to speed up and slim down your network.
-
-## Training on Your Own Categories
-
-If you've managed to get the script working on the flower example images, you
-can start looking at teaching it to recognize categories you care about instead.
-In theory all you'll need to do is point it at a set of sub-folders, each named
-after one of your categories and containing only images from that category. If
-you do that and pass the root folder of the subdirectories as the argument to
-`--image_dir`, the script should train just like it did for the flowers.
-
-Here's what the folder structure of the flowers archive looks like, to give you
-and example of the kind of layout the script is looking for:
-
-![Folder Structure](https://www.tensorflow.org/images/folder_structure.png)
-
-In practice it may take some work to get the accuracy you want. I'll try to
-guide you through some of the common problems you might encounter below.
-
-## Creating a Set of Training Images
-
-The first place to start is by looking at the images you've gathered, since the
-most common issues we see with training come from the data that's being fed in.
-
-For training to work well, you should gather at least a hundred photos of each
-kind of object you want to recognize. The more you can gather, the better the
-accuracy of your trained model is likely to be. You also need to make sure that
-the photos are a good representation of what your application will actually
-encounter. For example, if you take all your photos indoors against a blank wall
-and your users are trying to recognize objects outdoors, you probably won't see
-good results when you deploy.
-
-Another pitfall to avoid is that the learning process will pick up on anything
-that the labeled images have in common with each other, and if you're not
-careful that might be something that's not useful. For example if you photograph
-one kind of object in a blue room, and another in a green one, then the model
-will end up basing its prediction on the background color, not the features of
-the object you actually care about. To avoid this, try to take pictures in as
-wide a variety of situations as you can, at different times, and with different
-devices. If you want to know more about this problem, you can read about the
-classic (and possibly apocryphal)
-[tank recognition problem](https://www.jefftk.com/p/detecting-tanks).
-
-You may also want to think about the categories you use. It might be worth
-splitting big categories that cover a lot of different physical forms into
-smaller ones that are more visually distinct. For example instead of 'vehicle'
-you might use 'car', 'motorbike', and 'truck'. It's also worth thinking about
-whether you have a 'closed world' or an 'open world' problem. In a closed world,
-the only things you'll ever be asked to categorize are the classes of object you
-know about. This might apply to a plant recognition app where you know the user
-is likely to be taking a picture of a flower, so all you have to do is decide
-which species. By contrast a roaming robot might see all sorts of different
-things through its camera as it wanders around the world. In that case you'd
-want the classifier to report if it wasn't sure what it was seeing. This can be
-hard to do well, but often if you collect a large number of typical 'background'
-photos with no relevant objects in them, you can add them to an extra 'unknown'
-class in your image folders.
-
-It's also worth checking to make sure that all of your images are labeled
-correctly. Often user-generated tags are unreliable for our purposes, for
-example using #daisy for pictures of a person named Daisy. If you go through
-your images and weed out any mistakes it can do wonders for your overall
-accuracy.
-
-## Training Steps
-
-If you're happy with your images, you can take a look at improving your results
-by altering the details of the learning process. The simplest one to try is
-`--how_many_training_steps`. This defaults to 4,000, but if you increase it to
-8,000 it will train for twice as long. The rate of improvement in the accuracy
-slows the longer you train for, and at some point will stop altogether, but you
-can experiment to see when you hit that limit for your model.
-
-## Distortions
-
-A common way of improving the results of image training is by deforming,
-cropping, or brightening the training inputs in random ways. This has the
-advantage of expanding the effective size of the training data thanks to all the
-possible variations of the same images, and tends to help the network learn to
-cope with all the distortions that will occur in real-life uses of the
-classifier. The biggest disadvantage of enabling these distortions in our script
-is that the bottleneck caching is no longer useful, since input images are never
-reused exactly. This means the training process takes a lot longer, so I
-recommend trying this as a way of fine-tuning your model once you've got one
-that you're reasonably happy with.
-
-You enable these distortions by passing `--random_crop`, `--random_scale` and
-`--random_brightness` to the script. These are all percentage values that
-control how much of each of the distortions is applied to each image. It's
-reasonable to start with values of 5 or 10 for each of them and then experiment
-to see which of them help with your application. `--flip_left_right` will
-randomly mirror half of the images horizontally, which makes sense as long as
-those inversions are likely to happen in your application. For example it
-wouldn't be a good idea if you were trying to recognize letters, since flipping
-them destroys their meaning.
-
-## Hyper-parameters
-
-There are several other parameters you can try adjusting to see if they help
-your results. The `--learning_rate` controls the magnitude of the updates to the
-final layer during training. Intuitively if this is smaller then the learning
-will take longer, but it can end up helping the overall precision. That's not
-always the case though, so you need to experiment carefully to see what works
-for your case. The `--train_batch_size` controls how many images are examined
-during one training step, and because the learning rate is applied per batch
-you'll need to reduce it if you have larger batches to get the same overall
-effect.
-
-## Training, Validation, and Testing Sets
-
-One of the things the script does under the hood when you point it at a folder
-of images is divide them up into three different sets. The largest is usually
-the training set, which are all the images fed into the network during training,
-with the results used to update the model's weights. You might wonder why we
-don't use all the images for training? A big potential problem when we're doing
-machine learning is that our model may just be memorizing irrelevant details of
-the training images to come up with the right answers. For example, you could
-imagine a network remembering a pattern in the background of each photo it was
-shown, and using that to match labels with objects. It could produce good
-results on all the images it's seen before during training, but then fail on new
-images because it's not learned general characteristics of the objects, just
-memorized unimportant details of the training images.
-
-This problem is known as overfitting, and to avoid it we keep some of our data
-out of the training process, so that the model can't memorize them. We then use
-those images as a check to make sure that overfitting isn't occurring, since if
-we see good accuracy on them it's a good sign the network isn't overfitting. The
-usual split is to put 80% of the images into the main training set, keep 10%
-aside to run as validation frequently during training, and then have a final 10%
-that are used less often as a testing set to predict the real-world performance
-of the classifier. These ratios can be controlled using the
-`--testing_percentage` and `--validation_percentage` flags. In general
-you should be able to leave these values at their defaults, since you won't
-usually find any advantage to training to adjusting them.
-
-Note that the script uses the image filenames (rather than a completely random
-function) to divide the images among the training, validation, and test sets.
-This is done to ensure that images don't get moved between training and testing
-sets on different runs, since that could be a problem if images that had been
-used for training a model were subsequently used in a validation set.
-
-You might notice that the validation accuracy fluctuates among iterations. Much
-of this fluctuation arises from the fact that a random subset of the validation
-set is chosen for each validation accuracy measurement. The fluctuations can be
-greatly reduced, at the cost of some increase in training time, by choosing
-`--validation_batch_size=-1`, which uses the entire validation set for each
-accuracy computation.
-
-Once training is complete, you may find it insightful to examine misclassified
-images in the test set. This can be done by adding the flag
-`--print_misclassified_test_images`. This may help you get a feeling for which
-types of images were most confusing for the model, and which categories were
-most difficult to distinguish. For instance, you might discover that some
-subtype of a particular category, or some unusual photo angle, is particularly
-difficult to identify, which may encourage you to add more training images of
-that subtype. Oftentimes, examining misclassified images can also point to
-errors in the input data set, such as mislabeled, low-quality, or ambiguous
-images. However, one should generally avoid point-fixing individual errors in
-the test set, since they are likely to merely reflect more general problems in
-the (much larger) training set.
-
-## Other Model Architectures
-
-By default the script uses a pretrained version of the Inception v3 model
-architecture. This is a good place to start because it provides high accuracy
-results, but if you intend to deploy your model on mobile devices or other
-resource-constrained environments you may want to trade off a little accuracy
-for much smaller file sizes or faster speeds. To help with that, the
-[retrain.py script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py)
-supports different variations on the [Mobilenet architecture](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html).
-
-These are a little less precise than Inception v3, but can result in far
-smaller file sizes (a few megabytes) and can be many times faster
-to run. To train with one of these models, pass in the `--architecture` flag,
-for example:
-
-```
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos --architecture mobilenet_0.25_128
-```
-
-This will create a 1.9MB model file in `/tmp/output_graph.pb`, with only 25% of
-the number of neurons of the full Mobilenet, and trained to take 128x128 sized
-input images.
-
-You can choose '1.0', '0.75', '0.50', or '0.25' to control the number of
-neurons (activations of hidden layers); the number of weights (and hence to
-some extent the file size and speed) shrinks like the square of that fraction.
-You can choose '224', '192', '160', or '128' for the input image size,
-with smaller sizes giving faster speeds.
-
-The speed and size advantages come at a loss to accuracy of course, but for many
-purposes this isn't critical. They can also be somewhat offset with improved
-training data. For example, training with distortions allows me to get above 80%
-accuracy on the flower data set even with the 0.25/128 graph above.
-
-If you're going to be using the Mobilenet models in label_image or your own
-programs, you'll need to feed in an image of the specified size converted to a
-float range into the 'input' tensor. Typically 24-bit images are in the range
-[0,255], and you must convert them to the [-1,1] float range expected by the
-model with the formula  `(image - 128.)/128.`.
-
-The default arguments for the `label_image` script are set for Inception V3.
-To use it with a MobileNet, specify the above normalization parameters as
-`input_mean` and `input_std` on the command line. You also must specify the
-image size that your model expects, as follows:
-
-```sh
-python tensorflow/examples/label_image/label_image.py \
---graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---input_layer=input \
---output_layer=final_result \
---input_height=224 --input_width=224 \
---input_mean=128 --input_std=128 \
---image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
-```
-
-For more information on deploying the retrained model to a mobile device, see
-the [codelab version](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0)
-of this tutorial, especially [part 2](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/#0), which describes
-[TensorFlow Lite](/mobile/tflite/) and the additional optimizations it offers
-(including quantization of model weights).
+**NOTE: This tutorial has moved to**
+https://github.com/tensorflow/hub/tree/master/docs/tutorials/image_retraining.md
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index a088d7cf2f05c81cf2e60cb5aa8de79957a30de2..aa594a63c6ad5ab7129e452e7a6345114b994231 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -76,7 +76,6 @@ android_binary(
     custom_package = "org.tensorflow.demo",
     inline_constants = 1,
     manifest = "AndroidManifest.xml",
-    manifest_merger = "legacy",
     resource_files = glob(["res/**"]),
     tags = [
         "manual",
diff --git a/tensorflow/examples/image_retraining/BUILD b/tensorflow/examples/image_retraining/BUILD
deleted file mode 100644
index ecd79a3b004d0ca9f50d2a6f140dbc353efe30cb..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/BUILD
+++ /dev/null
@@ -1,51 +0,0 @@
-# Description:
-# Transfer learning example for TensorFlow.
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-py_binary(
-    name = "retrain",
-    srcs = [
-        "retrain.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "retrain_test",
-    size = "small",
-    srcs = [
-        "retrain.py",
-        "retrain_test.py",
-    ],
-    data = [
-        ":data/labels.txt",
-        "//tensorflow/examples/label_image:data/grace_hopper.jpg",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":retrain",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/examples/image_retraining/README.md b/tensorflow/examples/image_retraining/README.md
index 8a49525c6eff003f2c7acb592f213285e627eb51..3f0b3d12682b81e7c13ed6c2d32149746d506cd3 100644
--- a/tensorflow/examples/image_retraining/README.md
+++ b/tensorflow/examples/image_retraining/README.md
@@ -1,12 +1,15 @@
-retrain.py is an example script that shows how one can adapt a pretrained
-network for other classification problems. A detailed overview of this script
-can be found at:
-https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0
+**NOTE: This code has moved to**
+https://github.com/tensorflow/hub/tree/master/examples/image_retraining
 
-The script also shows how one can train layers
-with quantized weights and activations instead of taking a pre-trained floating
-point model and then quantizing weights and activations.
-The output graphdef produced by this script is compatible with the TensorFlow
-Lite Optimizing Converter and can be converted to TFLite format.
+retrain.py is an example script that shows how one can adapt a pretrained
+network for other classification problems (including use with TFLite and
+quantization).
 
+As of TensorFlow 1.7, it is recommended to use a pretrained network from
+TensorFlow Hub, using the new version of this example found in the location
+above, as explained in TensorFlow's revised [image retraining
+tutorial](https://www.tensorflow.org/tutorials/image_retraining).
 
+Older versions of this example (using frozen GraphDefs instead of
+TensorFlow Hub modules) are available in the release branches of
+TensorFlow versions up to and including 1.7.
diff --git a/tensorflow/examples/image_retraining/__init__.py b/tensorflow/examples/image_retraining/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/tensorflow/examples/image_retraining/data/labels.txt b/tensorflow/examples/image_retraining/data/labels.txt
deleted file mode 100644
index bc1131ac4591ca1bdb840695b55f79a6feb95db3..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/data/labels.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-Runner-up
-Winner
-Loser
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
deleted file mode 100644
index fcc191250fe8c9d80e788b6d345b041c7ea22f2f..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/retrain.py
+++ /dev/null
@@ -1,1487 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Simple transfer learning with Inception v3 or Mobilenet models.
-
-With support for TensorBoard.
-
-This example shows how to take a Inception v3 or Mobilenet model trained on
-ImageNet images, and train a new top layer that can recognize other classes of
-images.
-
-The top layer receives as input a 2048-dimensional vector (1001-dimensional for
-Mobilenet) for each image. We train a softmax layer on top of this
-representation. Assuming the softmax layer contains N labels, this corresponds
-to learning N + 2048*N (or 1001*N)  model parameters corresponding to the
-learned biases and weights.
-
-Here's an example, which assumes you have a folder containing class-named
-subfolders, each full of images for each label. The example folder flower_photos
-should have a structure like this:
-
-~/flower_photos/daisy/photo1.jpg
-~/flower_photos/daisy/photo2.jpg
-...
-~/flower_photos/rose/anotherphoto77.jpg
-...
-~/flower_photos/sunflower/somepicture.jpg
-
-The subfolder names are important, since they define what label is applied to
-each image, but the filenames themselves don't matter. Once your images are
-prepared, you can run the training with a command like this:
-
-```bash
-bazel build tensorflow/examples/image_retraining:retrain && \
-bazel-bin/tensorflow/examples/image_retraining/retrain \
-    --image_dir ~/flower_photos
-```
-
-Or, if you have a pip installation of tensorflow, `retrain.py` can be run
-without bazel:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos
-```
-
-You can replace the image_dir argument with any folder containing subfolders of
-images. The label for each image is taken from the name of the subfolder it's
-in.
-
-This produces a new model file that can be loaded and run by any TensorFlow
-program, for example the label_image sample code.
-
-By default this script will use the high accuracy, but comparatively large and
-slow Inception v3 model architecture. It's recommended that you start with this
-to validate that you have gathered good training data, but if you want to deploy
-on resource-limited platforms, you can try the `--architecture` flag with a
-Mobilenet model. For example:
-
-Run floating-point version of mobilenet:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos --architecture mobilenet_1.0_224
-```
-
-Run mobilenet, instrumented for quantization:
-
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos/   --architecture mobilenet_1.0_224_quant
-```
-
-These instrumented models can be converted to fully quantized mobile models via
-TensorFlow Lite.
-
-There are 32 different Mobilenet models to choose from, with a variety of file
-size and latency options. The first number can be '1.0', '0.75', '0.50', or
-'0.25' to control the size, and the second controls the input image size, either
-'224', '192', '160', or '128', with smaller sizes running faster. See
-https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html
-for more information on Mobilenet.
-
-To use with TensorBoard:
-
-By default, this script will log summaries to /tmp/retrain_logs directory
-
-Visualize the summaries with this command:
-
-tensorboard --logdir /tmp/retrain_logs
-
-To use with Tensorflow Serving:
-
-```bash
-tensorflow_model_server --port=9000 --model_name=inception \
-    --model_base_path=/tmp/saved_models/
-```
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-from datetime import datetime
-import hashlib
-import os.path
-import random
-import re
-import sys
-import tarfile
-
-import numpy as np
-from six.moves import urllib
-import tensorflow as tf
-
-from tensorflow.python.framework import graph_util
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.platform import gfile
-from tensorflow.python.util import compat
-
-FLAGS = None
-
-# These are all parameters that are tied to the particular model architecture
-# we're using for Inception v3. These include things like tensor names and their
-# sizes. If you want to adapt this script to work with another model, you will
-# need to update these to reflect the values in the network you're using.
-MAX_NUM_IMAGES_PER_CLASS = 2 ** 27 - 1  # ~134M
-
-# The location where variable checkpoints will be stored.
-CHECKPOINT_NAME = '/tmp/_retrain_checkpoint'
-
-
-def create_image_lists(image_dir, testing_percentage, validation_percentage):
-  """Builds a list of training images from the file system.
-
-  Analyzes the sub folders in the image directory, splits them into stable
-  training, testing, and validation sets, and returns a data structure
-  describing the lists of images for each label and their paths.
-
-  Args:
-    image_dir: String path to a folder containing subfolders of images.
-    testing_percentage: Integer percentage of the images to reserve for tests.
-    validation_percentage: Integer percentage of images reserved for validation.
-
-  Returns:
-    A dictionary containing an entry for each label subfolder, with images split
-    into training, testing, and validation sets within each label.
-  """
-  if not gfile.Exists(image_dir):
-    tf.logging.error("Image directory '" + image_dir + "' not found.")
-    return None
-  result = {}
-  sub_dirs = [x[0] for x in gfile.Walk(image_dir)]
-  # The root directory comes first, so skip it.
-  is_root_dir = True
-  for sub_dir in sub_dirs:
-    if is_root_dir:
-      is_root_dir = False
-      continue
-    extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
-    file_list = []
-    dir_name = os.path.basename(sub_dir)
-    if dir_name == image_dir:
-      continue
-    tf.logging.info("Looking for images in '" + dir_name + "'")
-    for extension in extensions:
-      file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
-      file_list.extend(gfile.Glob(file_glob))
-    if not file_list:
-      tf.logging.warning('No files found')
-      continue
-    if len(file_list) < 20:
-      tf.logging.warning(
-          'WARNING: Folder has less than 20 images, which may cause issues.')
-    elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
-      tf.logging.warning(
-          'WARNING: Folder {} has more than {} images. Some images will '
-          'never be selected.'.format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
-    label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
-    training_images = []
-    testing_images = []
-    validation_images = []
-    for file_name in file_list:
-      base_name = os.path.basename(file_name)
-      # We want to ignore anything after '_nohash_' in the file name when
-      # deciding which set to put an image in, the data set creator has a way of
-      # grouping photos that are close variations of each other. For example
-      # this is used in the plant disease data set to group multiple pictures of
-      # the same leaf.
-      hash_name = re.sub(r'_nohash_.*$', '', file_name)
-      # This looks a bit magical, but we need to decide whether this file should
-      # go into the training, testing, or validation sets, and we want to keep
-      # existing files in the same set even if more files are subsequently
-      # added.
-      # To do that, we need a stable way of deciding based on just the file name
-      # itself, so we do a hash of that and then use that to generate a
-      # probability value that we use to assign it.
-      hash_name_hashed = hashlib.sha1(compat.as_bytes(hash_name)).hexdigest()
-      percentage_hash = ((int(hash_name_hashed, 16) %
-                          (MAX_NUM_IMAGES_PER_CLASS + 1)) *
-                         (100.0 / MAX_NUM_IMAGES_PER_CLASS))
-      if percentage_hash < validation_percentage:
-        validation_images.append(base_name)
-      elif percentage_hash < (testing_percentage + validation_percentage):
-        testing_images.append(base_name)
-      else:
-        training_images.append(base_name)
-    result[label_name] = {
-        'dir': dir_name,
-        'training': training_images,
-        'testing': testing_images,
-        'validation': validation_images,
-    }
-  return result
-
-
-def get_image_path(image_lists, label_name, index, image_dir, category):
-  """"Returns a path to an image for a label at the given index.
-
-  Args:
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Int offset of the image we want. This will be moduloed by the
-    available number of images for the label, so it can be arbitrarily large.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    category: Name string of set to pull images from - training, testing, or
-    validation.
-
-  Returns:
-    File system path string to an image that meets the requested parameters.
-
-  """
-  if label_name not in image_lists:
-    tf.logging.fatal('Label does not exist %s.', label_name)
-  label_lists = image_lists[label_name]
-  if category not in label_lists:
-    tf.logging.fatal('Category does not exist %s.', category)
-  category_list = label_lists[category]
-  if not category_list:
-    tf.logging.fatal('Label %s has no images in the category %s.',
-                     label_name, category)
-  mod_index = index % len(category_list)
-  base_name = category_list[mod_index]
-  sub_dir = label_lists['dir']
-  full_path = os.path.join(image_dir, sub_dir, base_name)
-  return full_path
-
-
-def get_bottleneck_path(image_lists, label_name, index, bottleneck_dir,
-                        category, architecture):
-  """"Returns a path to a bottleneck file for a label at the given index.
-
-  Args:
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Integer offset of the image we want. This will be moduloed by the
-    available number of images for the label, so it can be arbitrarily large.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    category: Name string of set to pull images from - training, testing, or
-    validation.
-    architecture: The name of the model architecture.
-
-  Returns:
-    File system path string to an image that meets the requested parameters.
-  """
-  return get_image_path(image_lists, label_name, index, bottleneck_dir,
-                        category) + '_' + architecture + '.txt'
-
-
-def create_model_graph(model_info):
-  """"Creates a graph from saved GraphDef file and returns a Graph object.
-
-  Args:
-    model_info: Dictionary containing information about the model architecture.
-
-  Returns:
-    Graph holding the trained Inception network, and various tensors we'll be
-    manipulating.
-  """
-  with tf.Graph().as_default() as graph:
-    model_path = os.path.join(FLAGS.model_dir, model_info['model_file_name'])
-    print('Model path: ', model_path)
-    with gfile.FastGFile(model_path, 'rb') as f:
-      graph_def = tf.GraphDef()
-      graph_def.ParseFromString(f.read())
-      bottleneck_tensor, resized_input_tensor = (tf.import_graph_def(
-          graph_def,
-          name='',
-          return_elements=[
-              model_info['bottleneck_tensor_name'],
-              model_info['resized_input_tensor_name'],
-          ]))
-  return graph, bottleneck_tensor, resized_input_tensor
-
-
-def run_bottleneck_on_image(sess, image_data, image_data_tensor,
-                            decoded_image_tensor, resized_input_tensor,
-                            bottleneck_tensor):
-  """Runs inference on an image to extract the 'bottleneck' summary layer.
-
-  Args:
-    sess: Current active TensorFlow Session.
-    image_data: String of raw JPEG data.
-    image_data_tensor: Input data layer in the graph.
-    decoded_image_tensor: Output of initial image resizing and preprocessing.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: Layer before the final softmax.
-
-  Returns:
-    Numpy array of bottleneck values.
-  """
-  # First decode the JPEG image, resize it, and rescale the pixel values.
-  resized_input_values = sess.run(decoded_image_tensor,
-                                  {image_data_tensor: image_data})
-  # Then run it through the recognition network.
-  bottleneck_values = sess.run(bottleneck_tensor,
-                               {resized_input_tensor: resized_input_values})
-  bottleneck_values = np.squeeze(bottleneck_values)
-  return bottleneck_values
-
-
-def maybe_download_and_extract(data_url):
-  """Download and extract model tar file.
-
-  If the pretrained model we're using doesn't already exist, this function
-  downloads it from the TensorFlow.org website and unpacks it into a directory.
-
-  Args:
-    data_url: Web location of the tar file containing the pretrained model.
-  """
-  dest_directory = FLAGS.model_dir
-  if not os.path.exists(dest_directory):
-    os.makedirs(dest_directory)
-  filename = data_url.split('/')[-1]
-  filepath = os.path.join(dest_directory, filename)
-  if not os.path.exists(filepath):
-
-    def _progress(count, block_size, total_size):
-      sys.stdout.write('\r>> Downloading %s %.1f%%' %
-                       (filename,
-                        float(count * block_size) / float(total_size) * 100.0))
-      sys.stdout.flush()
-
-    filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress)
-    print()
-    statinfo = os.stat(filepath)
-    tf.logging.info('Successfully downloaded %s %d bytes.', filename,
-                    statinfo.st_size)
-    print('Extracting file from ', filepath)
-    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
-  else:
-    print('Not extracting or downloading files, model already present in disk')
-
-
-def ensure_dir_exists(dir_name):
-  """Makes sure the folder exists on disk.
-
-  Args:
-    dir_name: Path string to the folder we want to create.
-  """
-  if not os.path.exists(dir_name):
-    os.makedirs(dir_name)
-
-
-bottleneck_path_2_bottleneck_values = {}
-
-
-def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor):
-  """Create a single bottleneck file."""
-  tf.logging.info('Creating bottleneck at ' + bottleneck_path)
-  image_path = get_image_path(image_lists, label_name, index,
-                              image_dir, category)
-  if not gfile.Exists(image_path):
-    tf.logging.fatal('File does not exist %s', image_path)
-  image_data = gfile.FastGFile(image_path, 'rb').read()
-  try:
-    bottleneck_values = run_bottleneck_on_image(
-        sess, image_data, jpeg_data_tensor, decoded_image_tensor,
-        resized_input_tensor, bottleneck_tensor)
-  except Exception as e:
-    raise RuntimeError('Error during processing file %s (%s)' % (image_path,
-                                                                 str(e)))
-  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
-  with open(bottleneck_path, 'w') as bottleneck_file:
-    bottleneck_file.write(bottleneck_string)
-
-
-def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
-                             category, bottleneck_dir, jpeg_data_tensor,
-                             decoded_image_tensor, resized_input_tensor,
-                             bottleneck_tensor, architecture):
-  """Retrieves or calculates bottleneck values for an image.
-
-  If a cached version of the bottleneck data exists on-disk, return that,
-  otherwise calculate the data and save it to disk for future use.
-
-  Args:
-    sess: The current active TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    label_name: Label string we want to get an image for.
-    index: Integer offset of the image we want. This will be modulo-ed by the
-    available number of images for the label, so it can be arbitrarily large.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    category: Name string of which set to pull images from - training, testing,
-    or validation.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    jpeg_data_tensor: The tensor to feed loaded jpeg data into.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The output tensor for the bottleneck values.
-    architecture: The name of the model architecture.
-
-  Returns:
-    Numpy array of values produced by the bottleneck layer for the image.
-  """
-  label_lists = image_lists[label_name]
-  sub_dir = label_lists['dir']
-  sub_dir_path = os.path.join(bottleneck_dir, sub_dir)
-  ensure_dir_exists(sub_dir_path)
-  bottleneck_path = get_bottleneck_path(image_lists, label_name, index,
-                                        bottleneck_dir, category, architecture)
-  if not os.path.exists(bottleneck_path):
-    create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor)
-  with open(bottleneck_path, 'r') as bottleneck_file:
-    bottleneck_string = bottleneck_file.read()
-  did_hit_error = False
-  try:
-    bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
-  except ValueError:
-    tf.logging.warning('Invalid float found, recreating bottleneck')
-    did_hit_error = True
-  if did_hit_error:
-    create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor,
-                           decoded_image_tensor, resized_input_tensor,
-                           bottleneck_tensor)
-    with open(bottleneck_path, 'r') as bottleneck_file:
-      bottleneck_string = bottleneck_file.read()
-    # Allow exceptions to propagate here, since they shouldn't happen after a
-    # fresh creation
-    bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
-  return bottleneck_values
-
-
-def cache_bottlenecks(sess, image_lists, image_dir, bottleneck_dir,
-                      jpeg_data_tensor, decoded_image_tensor,
-                      resized_input_tensor, bottleneck_tensor, architecture):
-  """Ensures all the training, testing, and validation bottlenecks are cached.
-
-  Because we're likely to read the same image multiple times (if there are no
-  distortions applied during training) it can speed things up a lot if we
-  calculate the bottleneck layer values once for each image during
-  preprocessing, and then just read those cached values repeatedly during
-  training. Here we go through all the images we've found, calculate those
-  values, and save them off.
-
-  Args:
-    sess: The current active TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    jpeg_data_tensor: Input tensor for jpeg data from file.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The penultimate output layer of the graph.
-    architecture: The name of the model architecture.
-
-  Returns:
-    Nothing.
-  """
-  how_many_bottlenecks = 0
-  ensure_dir_exists(bottleneck_dir)
-  for label_name, label_lists in image_lists.items():
-    for category in ['training', 'testing', 'validation']:
-      category_list = label_lists[category]
-      for index, unused_base_name in enumerate(category_list):
-        get_or_create_bottleneck(
-            sess, image_lists, label_name, index, image_dir, category,
-            bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-            resized_input_tensor, bottleneck_tensor, architecture)
-
-        how_many_bottlenecks += 1
-        if how_many_bottlenecks % 100 == 0:
-          tf.logging.info(
-              str(how_many_bottlenecks) + ' bottleneck files created.')
-
-
-def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
-                                  bottleneck_dir, image_dir, jpeg_data_tensor,
-                                  decoded_image_tensor, resized_input_tensor,
-                                  bottleneck_tensor, architecture):
-  """Retrieves bottleneck values for cached images.
-
-  If no distortions are being applied, this function can retrieve the cached
-  bottleneck values directly from disk for images. It picks a random set of
-  images from the specified category.
-
-  Args:
-    sess: Current TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    how_many: If positive, a random sample of this size will be chosen.
-    If negative, all bottlenecks will be retrieved.
-    category: Name string of which set to pull from - training, testing, or
-    validation.
-    bottleneck_dir: Folder string holding cached files of bottleneck values.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    jpeg_data_tensor: The layer to feed jpeg image data into.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The bottleneck output layer of the CNN graph.
-    architecture: The name of the model architecture.
-
-  Returns:
-    List of bottleneck arrays, their corresponding ground truths, and the
-    relevant filenames.
-  """
-  class_count = len(image_lists.keys())
-  bottlenecks = []
-  ground_truths = []
-  filenames = []
-  if how_many >= 0:
-    # Retrieve a random sample of bottlenecks.
-    for unused_i in range(how_many):
-      label_index = random.randrange(class_count)
-      label_name = list(image_lists.keys())[label_index]
-      image_index = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
-      image_name = get_image_path(image_lists, label_name, image_index,
-                                  image_dir, category)
-      bottleneck = get_or_create_bottleneck(
-          sess, image_lists, label_name, image_index, image_dir, category,
-          bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-          resized_input_tensor, bottleneck_tensor, architecture)
-      bottlenecks.append(bottleneck)
-      ground_truths.append(label_index)
-      filenames.append(image_name)
-  else:
-    # Retrieve all bottlenecks.
-    for label_index, label_name in enumerate(image_lists.keys()):
-      for image_index, image_name in enumerate(
-          image_lists[label_name][category]):
-        image_name = get_image_path(image_lists, label_name, image_index,
-                                    image_dir, category)
-        bottleneck = get_or_create_bottleneck(
-            sess, image_lists, label_name, image_index, image_dir, category,
-            bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
-            resized_input_tensor, bottleneck_tensor, architecture)
-        bottlenecks.append(bottleneck)
-        ground_truths.append(label_index)
-        filenames.append(image_name)
-  return bottlenecks, ground_truths, filenames
-
-
-def get_random_distorted_bottlenecks(
-    sess, image_lists, how_many, category, image_dir, input_jpeg_tensor,
-    distorted_image, resized_input_tensor, bottleneck_tensor):
-  """Retrieves bottleneck values for training images, after distortions.
-
-  If we're training with distortions like crops, scales, or flips, we have to
-  recalculate the full model for every image, and so we can't use cached
-  bottleneck values. Instead we find random images for the requested category,
-  run them through the distortion graph, and then the full graph to get the
-  bottleneck results for each.
-
-  Args:
-    sess: Current TensorFlow Session.
-    image_lists: Dictionary of training images for each label.
-    how_many: The integer number of bottleneck values to return.
-    category: Name string of which set of images to fetch - training, testing,
-    or validation.
-    image_dir: Root folder string of the subfolders containing the training
-    images.
-    input_jpeg_tensor: The input layer we feed the image data to.
-    distorted_image: The output node of the distortion graph.
-    resized_input_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The bottleneck output layer of the CNN graph.
-
-  Returns:
-    List of bottleneck arrays and their corresponding ground truths.
-  """
-  class_count = len(image_lists.keys())
-  bottlenecks = []
-  ground_truths = []
-  for unused_i in range(how_many):
-    label_index = random.randrange(class_count)
-    label_name = list(image_lists.keys())[label_index]
-    image_index = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
-    image_path = get_image_path(image_lists, label_name, image_index, image_dir,
-                                category)
-    if not gfile.Exists(image_path):
-      tf.logging.fatal('File does not exist %s', image_path)
-    jpeg_data = gfile.FastGFile(image_path, 'rb').read()
-    # Note that we materialize the distorted_image_data as a numpy array before
-    # sending running inference on the image. This involves 2 memory copies and
-    # might be optimized in other implementations.
-    distorted_image_data = sess.run(distorted_image,
-                                    {input_jpeg_tensor: jpeg_data})
-    bottleneck_values = sess.run(bottleneck_tensor,
-                                 {resized_input_tensor: distorted_image_data})
-    bottleneck_values = np.squeeze(bottleneck_values)
-    bottlenecks.append(bottleneck_values)
-    ground_truths.append(label_index)
-  return bottlenecks, ground_truths
-
-
-def should_distort_images(flip_left_right, random_crop, random_scale,
-                          random_brightness):
-  """Whether any distortions are enabled, from the input flags.
-
-  Args:
-    flip_left_right: Boolean whether to randomly mirror images horizontally.
-    random_crop: Integer percentage setting the total margin used around the
-    crop box.
-    random_scale: Integer percentage of how much to vary the scale by.
-    random_brightness: Integer range to randomly multiply the pixel values by.
-
-  Returns:
-    Boolean value indicating whether any distortions should be applied.
-  """
-  return (flip_left_right or (random_crop != 0) or (random_scale != 0) or
-          (random_brightness != 0))
-
-
-def add_input_distortions(flip_left_right, random_crop, random_scale,
-                          random_brightness, input_width, input_height,
-                          input_depth, input_mean, input_std):
-  """Creates the operations to apply the specified distortions.
-
-  During training it can help to improve the results if we run the images
-  through simple distortions like crops, scales, and flips. These reflect the
-  kind of variations we expect in the real world, and so can help train the
-  model to cope with natural data more effectively. Here we take the supplied
-  parameters and construct a network of operations to apply them to an image.
-
-  Cropping
-  ~~~~~~~~
-
-  Cropping is done by placing a bounding box at a random position in the full
-  image. The cropping parameter controls the size of that box relative to the
-  input image. If it's zero, then the box is the same size as the input and no
-  cropping is performed. If the value is 50%, then the crop box will be half the
-  width and height of the input. In a diagram it looks like this:
-
-  <       width         >
-  +---------------------+
-  |                     |
-  |   width - crop%     |
-  |    <      >         |
-  |    +------+         |
-  |    |      |         |
-  |    |      |         |
-  |    |      |         |
-  |    +------+         |
-  |                     |
-  |                     |
-  +---------------------+
-
-  Scaling
-  ~~~~~~~
-
-  Scaling is a lot like cropping, except that the bounding box is always
-  centered and its size varies randomly within the given range. For example if
-  the scale percentage is zero, then the bounding box is the same size as the
-  input and no scaling is applied. If it's 50%, then the bounding box will be in
-  a random range between half the width and height and full size.
-
-  Args:
-    flip_left_right: Boolean whether to randomly mirror images horizontally.
-    random_crop: Integer percentage setting the total margin used around the
-    crop box.
-    random_scale: Integer percentage of how much to vary the scale by.
-    random_brightness: Integer range to randomly multiply the pixel values by.
-    graph.
-    input_width: Horizontal size of expected input image to model.
-    input_height: Vertical size of expected input image to model.
-    input_depth: How many channels the expected input image should have.
-    input_mean: Pixel value that should be zero in the image for the graph.
-    input_std: How much to divide the pixel values by before recognition.
-
-  Returns:
-    The jpeg input layer and the distorted result tensor.
-  """
-
-  jpeg_data = tf.placeholder(tf.string, name='DistortJPGInput')
-  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=input_depth)
-  decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
-  decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
-  margin_scale = 1.0 + (random_crop / 100.0)
-  resize_scale = 1.0 + (random_scale / 100.0)
-  margin_scale_value = tf.constant(margin_scale)
-  resize_scale_value = tf.random_uniform(tensor_shape.scalar(),
-                                         minval=1.0,
-                                         maxval=resize_scale)
-  scale_value = tf.multiply(margin_scale_value, resize_scale_value)
-  precrop_width = tf.multiply(scale_value, input_width)
-  precrop_height = tf.multiply(scale_value, input_height)
-  precrop_shape = tf.stack([precrop_height, precrop_width])
-  precrop_shape_as_int = tf.cast(precrop_shape, dtype=tf.int32)
-  precropped_image = tf.image.resize_bilinear(decoded_image_4d,
-                                              precrop_shape_as_int)
-  precropped_image_3d = tf.squeeze(precropped_image, squeeze_dims=[0])
-  cropped_image = tf.random_crop(precropped_image_3d,
-                                 [input_height, input_width, input_depth])
-  if flip_left_right:
-    flipped_image = tf.image.random_flip_left_right(cropped_image)
-  else:
-    flipped_image = cropped_image
-  brightness_min = 1.0 - (random_brightness / 100.0)
-  brightness_max = 1.0 + (random_brightness / 100.0)
-  brightness_value = tf.random_uniform(tensor_shape.scalar(),
-                                       minval=brightness_min,
-                                       maxval=brightness_max)
-  brightened_image = tf.multiply(flipped_image, brightness_value)
-  offset_image = tf.subtract(brightened_image, input_mean)
-  mul_image = tf.multiply(offset_image, 1.0 / input_std)
-  distort_result = tf.expand_dims(mul_image, 0, name='DistortResult')
-  return jpeg_data, distort_result
-
-
-def variable_summaries(var):
-  """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
-  with tf.name_scope('summaries'):
-    mean = tf.reduce_mean(var)
-    tf.summary.scalar('mean', mean)
-    with tf.name_scope('stddev'):
-      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
-    tf.summary.scalar('stddev', stddev)
-    tf.summary.scalar('max', tf.reduce_max(var))
-    tf.summary.scalar('min', tf.reduce_min(var))
-    tf.summary.histogram('histogram', var)
-
-
-def add_final_retrain_ops(class_count, final_tensor_name, bottleneck_tensor,
-                          bottleneck_tensor_size, quantize_layer, is_training):
-  """Adds a new softmax and fully-connected layer for training and eval.
-
-  We need to retrain the top layer to identify our new classes, so this function
-  adds the right operations to the graph, along with some variables to hold the
-  weights, and then sets up all the gradients for the backward pass.
-
-  The set up for the softmax and fully-connected layers is based on:
-  https://www.tensorflow.org/versions/master/tutorials/mnist/beginners/index.html
-
-  Args:
-    class_count: Integer of how many categories of things we're trying to
-        recognize.
-    final_tensor_name: Name string for the new final node that produces results.
-    bottleneck_tensor: The output of the main CNN graph.
-    bottleneck_tensor_size: How many entries in the bottleneck vector.
-    quantize_layer: Boolean, specifying whether the newly added layer should be
-        instrumented for quantized.
-    is_training: Boolean, specifying whether the newly add layer is for training
-        or eval.
-
-  Returns:
-    The tensors for the training and cross entropy results, and tensors for the
-    bottleneck input and ground truth input.
-  """
-  with tf.name_scope('input'):
-    bottleneck_input = tf.placeholder_with_default(
-        bottleneck_tensor,
-        shape=[None, bottleneck_tensor_size],
-        name='BottleneckInputPlaceholder')
-
-    ground_truth_input = tf.placeholder(
-        tf.int64, [None], name='GroundTruthInput')
-
-  # Organizing the following ops so they are easier to see in TensorBoard.
-  layer_name = 'final_retrain_ops'
-  with tf.name_scope(layer_name):
-    with tf.name_scope('weights'):
-      initial_value = tf.truncated_normal(
-          [bottleneck_tensor_size, class_count], stddev=0.001)
-      layer_weights = tf.Variable(initial_value, name='final_weights')
-      variable_summaries(layer_weights)
-
-    with tf.name_scope('biases'):
-      layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
-      variable_summaries(layer_biases)
-
-    with tf.name_scope('Wx_plus_b'):
-      logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
-      tf.summary.histogram('pre_activations', logits)
-
-  final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
-
-  # The tf.contrib.quantize functions rewrite the graph in place for
-  # quantization. The imported model graph has already been rewritten, so upon
-  # calling these rewrites, only the newly added final layer will be
-  # transformed.
-  if quantize_layer:
-    if is_training:
-      tf.contrib.quantize.create_training_graph()
-    else:
-      tf.contrib.quantize.create_eval_graph()
-
-  tf.summary.histogram('activations', final_tensor)
-
-  # If this is an eval graph, we don't need to add loss ops or an optimizer.
-  if not is_training:
-    return None, None, bottleneck_input, ground_truth_input, final_tensor
-
-  with tf.name_scope('cross_entropy'):
-    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
-        labels=ground_truth_input, logits=logits)
-
-  tf.summary.scalar('cross_entropy', cross_entropy_mean)
-
-  with tf.name_scope('train'):
-    optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
-    train_step = optimizer.minimize(cross_entropy_mean)
-
-  return (train_step, cross_entropy_mean, bottleneck_input, ground_truth_input,
-          final_tensor)
-
-
-def add_evaluation_step(result_tensor, ground_truth_tensor):
-  """Inserts the operations we need to evaluate the accuracy of our results.
-
-  Args:
-    result_tensor: The new final node that produces results.
-    ground_truth_tensor: The node we feed ground truth data
-    into.
-
-  Returns:
-    Tuple of (evaluation step, prediction).
-  """
-  with tf.name_scope('accuracy'):
-    with tf.name_scope('correct_prediction'):
-      prediction = tf.argmax(result_tensor, 1)
-      correct_prediction = tf.equal(prediction, ground_truth_tensor)
-    with tf.name_scope('accuracy'):
-      evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-  tf.summary.scalar('accuracy', evaluation_step)
-  return evaluation_step, prediction
-
-
-def run_final_eval(sess, model_info, class_count, image_lists, jpeg_data_tensor,
-                   decoded_image_tensor, resized_image_tensor,
-                   bottleneck_tensor):
-  """Runs a final evaluation on an eval graph using the test data set.
-
-  Args:
-    sess: Session for the train graph.
-    model_info: Model info dictionary from create_model_info()
-    class_count: Number of classes
-    image_lists: Dictionary of training images for each label.
-    jpeg_data_tensor: The layer to feed jpeg image data into.
-    decoded_image_tensor: The output of decoding and resizing the image.
-    resized_image_tensor: The input node of the recognition graph.
-    bottleneck_tensor: The bottleneck output layer of the CNN graph.
-  """
-  test_bottlenecks, test_ground_truth, test_filenames = (
-      get_random_cached_bottlenecks(sess, image_lists, FLAGS.test_batch_size,
-                                    'testing', FLAGS.bottleneck_dir,
-                                    FLAGS.image_dir, jpeg_data_tensor,
-                                    decoded_image_tensor, resized_image_tensor,
-                                    bottleneck_tensor, FLAGS.architecture))
-
-  (sess, bottleneck_input, ground_truth_input, evaluation_step,
-   prediction) = build_eval_session(model_info, class_count)
-
-  test_accuracy, predictions = sess.run(
-      [evaluation_step, prediction],
-      feed_dict={
-          bottleneck_input: test_bottlenecks,
-          ground_truth_input: test_ground_truth
-      })
-  tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
-                  (test_accuracy * 100, len(test_bottlenecks)))
-
-  if FLAGS.print_misclassified_test_images:
-    tf.logging.info('=== MISCLASSIFIED TEST IMAGES ===')
-    for i, test_filename in enumerate(test_filenames):
-      if predictions[i] != test_ground_truth[i]:
-        tf.logging.info('%70s  %s' % (test_filename,
-                                      list(image_lists.keys())[predictions[i]]))
-
-
-def build_eval_session(model_info, class_count):
-  """Builds an restored eval session without train operations for exporting.
-
-  Args:
-    model_info: Model info dictionary from create_model_info()
-    class_count: Number of classes
-
-  Returns:
-    Eval session containing the restored eval graph.
-    The bottleneck input, ground truth, eval step, and prediction tensors.
-  """
-  # If quantized, we need to create the correct eval graph for exporting.
-  eval_graph, bottleneck_tensor, _ = create_model_graph(model_info)
-
-  eval_sess = tf.Session(graph=eval_graph)
-  with eval_graph.as_default():
-    # Add the new layer for exporting.
-    (_, _, bottleneck_input,
-     ground_truth_input, final_tensor) = add_final_retrain_ops(
-         class_count, FLAGS.final_tensor_name, bottleneck_tensor,
-         model_info['bottleneck_tensor_size'], model_info['quantize_layer'],
-         False)
-
-    # Now we need to restore the values from the training graph to the eval
-    # graph.
-    tf.train.Saver().restore(eval_sess, CHECKPOINT_NAME)
-
-    evaluation_step, prediction = add_evaluation_step(final_tensor,
-                                                      ground_truth_input)
-
-  return (eval_sess, bottleneck_input, ground_truth_input, evaluation_step,
-          prediction)
-
-
-def save_graph_to_file(graph, graph_file_name, model_info, class_count):
-  """Saves an graph to file, creating a valid quantized one if necessary."""
-  sess, _, _, _, _ = build_eval_session(model_info, class_count)
-  graph = sess.graph
-
-  output_graph_def = graph_util.convert_variables_to_constants(
-      sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
-
-  with gfile.FastGFile(graph_file_name, 'wb') as f:
-    f.write(output_graph_def.SerializeToString())
-
-
-def prepare_file_system():
-  # Setup the directory we'll write summaries to for TensorBoard
-  if tf.gfile.Exists(FLAGS.summaries_dir):
-    tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
-  tf.gfile.MakeDirs(FLAGS.summaries_dir)
-  if FLAGS.intermediate_store_frequency > 0:
-    ensure_dir_exists(FLAGS.intermediate_output_graphs_dir)
-  return
-
-
-def create_model_info(architecture):
-  """Given the name of a model architecture, returns information about it.
-
-  There are different base image recognition pretrained models that can be
-  retrained using transfer learning, and this function translates from the name
-  of a model to the attributes that are needed to download and train with it.
-
-  Args:
-    architecture: Name of a model architecture.
-
-  Returns:
-    Dictionary of information about the model, or None if the name isn't
-    recognized
-
-  Raises:
-    ValueError: If architecture name is unknown.
-  """
-  architecture = architecture.lower()
-  is_quantized = False
-  if architecture == 'inception_v3':
-    # pylint: disable=line-too-long
-    data_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
-    # pylint: enable=line-too-long
-    bottleneck_tensor_name = 'pool_3/_reshape:0'
-    bottleneck_tensor_size = 2048
-    input_width = 299
-    input_height = 299
-    input_depth = 3
-    resized_input_tensor_name = 'Mul:0'
-    model_file_name = 'classify_image_graph_def.pb'
-    input_mean = 128
-    input_std = 128
-  elif architecture.startswith('mobilenet_'):
-    parts = architecture.split('_')
-    if len(parts) != 3 and len(parts) != 4:
-      tf.logging.error("Couldn't understand architecture name '%s'",
-                       architecture)
-      return None
-    version_string = parts[1]
-    if (version_string != '1.0' and version_string != '0.75' and
-        version_string != '0.5' and version_string != '0.25'):
-      tf.logging.error(
-          """"The Mobilenet version should be '1.0', '0.75', '0.5', or '0.25',
-  but found '%s' for architecture '%s'""", version_string, architecture)
-      return None
-    size_string = parts[2]
-    if (size_string != '224' and size_string != '192' and
-        size_string != '160' and size_string != '128'):
-      tf.logging.error(
-          """The Mobilenet input size should be '224', '192', '160', or '128',
- but found '%s' for architecture '%s'""",
-          size_string, architecture)
-      return None
-    if len(parts) == 3:
-      is_quantized = False
-    else:
-      if parts[3] != 'quant':
-        tf.logging.error(
-            "Couldn't understand architecture suffix '%s' for '%s'", parts[3],
-            architecture)
-        return None
-      is_quantized = True
-
-    data_url = 'http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/'
-    model_name = 'mobilenet_v1_' + version_string + '_' + size_string
-    if is_quantized:
-      model_name += '_quant'
-    data_url += model_name + '.tgz'
-    bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
-    resized_input_tensor_name = 'input:0'
-    model_file_name = model_name + '_frozen.pb'
-
-    bottleneck_tensor_size = 1001
-    input_width = int(size_string)
-    input_height = int(size_string)
-    input_depth = 3
-    input_mean = 127.5
-    input_std = 127.5
-  else:
-    tf.logging.error("Couldn't understand architecture name '%s'", architecture)
-    raise ValueError('Unknown architecture', architecture)
-
-  return {
-      'data_url': data_url,
-      'bottleneck_tensor_name': bottleneck_tensor_name,
-      'bottleneck_tensor_size': bottleneck_tensor_size,
-      'input_width': input_width,
-      'input_height': input_height,
-      'input_depth': input_depth,
-      'resized_input_tensor_name': resized_input_tensor_name,
-      'model_file_name': model_file_name,
-      'input_mean': input_mean,
-      'input_std': input_std,
-      'quantize_layer': is_quantized,
-  }
-
-
-def add_jpeg_decoding(input_width, input_height, input_depth, input_mean,
-                      input_std):
-  """Adds operations that perform JPEG decoding and resizing to the graph..
-
-  Args:
-    input_width: Desired width of the image fed into the recognizer graph.
-    input_height: Desired width of the image fed into the recognizer graph.
-    input_depth: Desired channels of the image fed into the recognizer graph.
-    input_mean: Pixel value that should be zero in the image for the graph.
-    input_std: How much to divide the pixel values by before recognition.
-
-  Returns:
-    Tensors for the node to feed JPEG data into, and the output of the
-      preprocessing steps.
-  """
-  jpeg_data = tf.placeholder(tf.string, name='DecodeJPGInput')
-  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=input_depth)
-  decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
-  decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
-  resize_shape = tf.stack([input_height, input_width])
-  resize_shape_as_int = tf.cast(resize_shape, dtype=tf.int32)
-  resized_image = tf.image.resize_bilinear(decoded_image_4d,
-                                           resize_shape_as_int)
-  offset_image = tf.subtract(resized_image, input_mean)
-  mul_image = tf.multiply(offset_image, 1.0 / input_std)
-  return jpeg_data, mul_image
-
-
-def export_model(model_info, class_count, saved_model_dir):
-  """Exports model for serving.
-
-  Args:
-    model_info: The modelinfo for the current model.
-    class_count: The number of classes.
-    saved_model_dir: Directory in which to save exported model and variables.
-  """
-  # The SavedModel should hold the eval graph.
-  sess, _, _, _, _ = build_eval_session(model_info, class_count)
-  graph = sess.graph
-  with graph.as_default():
-    input_tensor = model_info['resized_input_tensor_name']
-    in_image = sess.graph.get_tensor_by_name(input_tensor)
-    inputs = {'image': tf.saved_model.utils.build_tensor_info(in_image)}
-
-    out_classes = sess.graph.get_tensor_by_name('final_result:0')
-    outputs = {
-        'prediction': tf.saved_model.utils.build_tensor_info(out_classes)
-    }
-
-    signature = tf.saved_model.signature_def_utils.build_signature_def(
-        inputs=inputs,
-        outputs=outputs,
-        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
-
-    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
-
-    # Save out the SavedModel.
-    builder = tf.saved_model.builder.SavedModelBuilder(saved_model_dir)
-    builder.add_meta_graph_and_variables(
-        sess, [tf.saved_model.tag_constants.SERVING],
-        signature_def_map={
-            tf.saved_model.signature_constants.
-            DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-                signature
-        },
-        legacy_init_op=legacy_init_op)
-    builder.save()
-
-
-def main(_):
-  # Needed to make sure the logging output is visible.
-  # See https://github.com/tensorflow/tensorflow/issues/3047
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  # Prepare necessary directories that can be used during training
-  prepare_file_system()
-
-  # Gather information about the model architecture we'll be using.
-  model_info = create_model_info(FLAGS.architecture)
-  if not model_info:
-    tf.logging.error('Did not recognize architecture flag')
-    return -1
-
-  # Look at the folder structure, and create lists of all the images.
-  image_lists = create_image_lists(FLAGS.image_dir, FLAGS.testing_percentage,
-                                   FLAGS.validation_percentage)
-  class_count = len(image_lists.keys())
-  if class_count == 0:
-    tf.logging.error('No valid folders of images found at ' + FLAGS.image_dir)
-    return -1
-  if class_count == 1:
-    tf.logging.error('Only one valid folder of images found at ' +
-                     FLAGS.image_dir +
-                     ' - multiple classes are needed for classification.')
-    return -1
-
-  # See if the command-line flags mean we're applying any distortions.
-  do_distort_images = should_distort_images(
-      FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
-      FLAGS.random_brightness)
-
-  # Set up the pre-trained graph.
-  maybe_download_and_extract(model_info['data_url'])
-  graph, bottleneck_tensor, resized_image_tensor = (
-      create_model_graph(model_info))
-
-  # Add the new layer that we'll be training.
-  with graph.as_default():
-    (train_step, cross_entropy, bottleneck_input,
-     ground_truth_input, final_tensor) = add_final_retrain_ops(
-         class_count, FLAGS.final_tensor_name, bottleneck_tensor,
-         model_info['bottleneck_tensor_size'], model_info['quantize_layer'],
-         True)
-
-  with tf.Session(graph=graph) as sess:
-    # Set up the image decoding sub-graph.
-    jpeg_data_tensor, decoded_image_tensor = add_jpeg_decoding(
-        model_info['input_width'], model_info['input_height'],
-        model_info['input_depth'], model_info['input_mean'],
-        model_info['input_std'])
-
-    if do_distort_images:
-      # We will be applying distortions, so setup the operations we'll need.
-      (distorted_jpeg_data_tensor,
-       distorted_image_tensor) = add_input_distortions(
-           FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
-           FLAGS.random_brightness, model_info['input_width'],
-           model_info['input_height'], model_info['input_depth'],
-           model_info['input_mean'], model_info['input_std'])
-    else:
-      # We'll make sure we've calculated the 'bottleneck' image summaries and
-      # cached them on disk.
-      cache_bottlenecks(sess, image_lists, FLAGS.image_dir,
-                        FLAGS.bottleneck_dir, jpeg_data_tensor,
-                        decoded_image_tensor, resized_image_tensor,
-                        bottleneck_tensor, FLAGS.architecture)
-
-    # Create the operations we need to evaluate the accuracy of our new layer.
-    evaluation_step, _ = add_evaluation_step(final_tensor, ground_truth_input)
-
-    # Merge all the summaries and write them out to the summaries_dir
-    merged = tf.summary.merge_all()
-    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
-                                         sess.graph)
-
-    validation_writer = tf.summary.FileWriter(
-        FLAGS.summaries_dir + '/validation')
-
-    # Create a train saver that is used to restore values into an eval graph
-    # when exporting models.
-    train_saver = tf.train.Saver()
-
-    # Set up all our weights to their initial default values.
-    init = tf.global_variables_initializer()
-    sess.run(init)
-
-    # Run the training for as many cycles as requested on the command line.
-    for i in range(FLAGS.how_many_training_steps):
-      # Get a batch of input bottleneck values, either calculated fresh every
-      # time with distortions applied, or from the cache stored on disk.
-      if do_distort_images:
-        (train_bottlenecks,
-         train_ground_truth) = get_random_distorted_bottlenecks(
-             sess, image_lists, FLAGS.train_batch_size, 'training',
-             FLAGS.image_dir, distorted_jpeg_data_tensor,
-             distorted_image_tensor, resized_image_tensor, bottleneck_tensor)
-      else:
-        (train_bottlenecks,
-         train_ground_truth, _) = get_random_cached_bottlenecks(
-             sess, image_lists, FLAGS.train_batch_size, 'training',
-             FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-             decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
-             FLAGS.architecture)
-      # Feed the bottlenecks and ground truth into the graph, and run a training
-      # step. Capture training summaries for TensorBoard with the `merged` op.
-      train_summary, _ = sess.run(
-          [merged, train_step],
-          feed_dict={bottleneck_input: train_bottlenecks,
-                     ground_truth_input: train_ground_truth})
-      train_writer.add_summary(train_summary, i)
-
-      # Every so often, print out how well the graph is training.
-      is_last_step = (i + 1 == FLAGS.how_many_training_steps)
-      if (i % FLAGS.eval_step_interval) == 0 or is_last_step:
-        train_accuracy, cross_entropy_value = sess.run(
-            [evaluation_step, cross_entropy],
-            feed_dict={bottleneck_input: train_bottlenecks,
-                       ground_truth_input: train_ground_truth})
-        tf.logging.info('%s: Step %d: Train accuracy = %.1f%%' %
-                        (datetime.now(), i, train_accuracy * 100))
-        tf.logging.info('%s: Step %d: Cross entropy = %f' %
-                        (datetime.now(), i, cross_entropy_value))
-        # TODO(suharshs): Make this use an eval graph, to avoid quantization
-        # moving averages being updated by the validation set, though in
-        # practice this makes a negligable difference.
-        validation_bottlenecks, validation_ground_truth, _ = (
-            get_random_cached_bottlenecks(
-                sess, image_lists, FLAGS.validation_batch_size, 'validation',
-                FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-                decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
-                FLAGS.architecture))
-        # Run a validation step and capture training summaries for TensorBoard
-        # with the `merged` op.
-        validation_summary, validation_accuracy = sess.run(
-            [merged, evaluation_step],
-            feed_dict={bottleneck_input: validation_bottlenecks,
-                       ground_truth_input: validation_ground_truth})
-        validation_writer.add_summary(validation_summary, i)
-        tf.logging.info('%s: Step %d: Validation accuracy = %.1f%% (N=%d)' %
-                        (datetime.now(), i, validation_accuracy * 100,
-                         len(validation_bottlenecks)))
-
-      # Store intermediate results
-      intermediate_frequency = FLAGS.intermediate_store_frequency
-
-      if (intermediate_frequency > 0 and (i % intermediate_frequency == 0)
-          and i > 0):
-        # If we want to do an intermediate save, save a checkpoint of the train
-        # graph, to restore into the eval graph.
-        train_saver.save(sess, CHECKPOINT_NAME)
-        intermediate_file_name = (FLAGS.intermediate_output_graphs_dir +
-                                  'intermediate_' + str(i) + '.pb')
-        tf.logging.info('Save intermediate result to : ' +
-                        intermediate_file_name)
-        save_graph_to_file(graph, intermediate_file_name, model_info,
-                           class_count)
-
-    # After training is complete, force one last save of the train checkpoint.
-    train_saver.save(sess, CHECKPOINT_NAME)
-
-    # We've completed all our training, so run a final test evaluation on
-    # some new images we haven't used before.
-    run_final_eval(sess, model_info, class_count, image_lists, jpeg_data_tensor,
-                   decoded_image_tensor, resized_image_tensor,
-                   bottleneck_tensor)
-
-    # Write out the trained graph and labels with the weights stored as
-    # constants.
-    save_graph_to_file(graph, FLAGS.output_graph, model_info, class_count)
-    with gfile.FastGFile(FLAGS.output_labels, 'w') as f:
-      f.write('\n'.join(image_lists.keys()) + '\n')
-
-    export_model(model_info, class_count, FLAGS.saved_model_dir)
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--image_dir',
-      type=str,
-      default='',
-      help='Path to folders of labeled images.'
-  )
-  parser.add_argument(
-      '--output_graph',
-      type=str,
-      default='/tmp/output_graph.pb',
-      help='Where to save the trained graph.'
-  )
-  parser.add_argument(
-      '--intermediate_output_graphs_dir',
-      type=str,
-      default='/tmp/intermediate_graph/',
-      help='Where to save the intermediate graphs.'
-  )
-  parser.add_argument(
-      '--intermediate_store_frequency',
-      type=int,
-      default=0,
-      help="""\
-         How many steps to store intermediate graph. If "0" then will not
-         store.\
-      """
-  )
-  parser.add_argument(
-      '--output_labels',
-      type=str,
-      default='/tmp/output_labels.txt',
-      help='Where to save the trained graph\'s labels.'
-  )
-  parser.add_argument(
-      '--summaries_dir',
-      type=str,
-      default='/tmp/retrain_logs',
-      help='Where to save summary logs for TensorBoard.'
-  )
-  parser.add_argument(
-      '--how_many_training_steps',
-      type=int,
-      default=4000,
-      help='How many training steps to run before ending.'
-  )
-  parser.add_argument(
-      '--learning_rate',
-      type=float,
-      default=0.01,
-      help='How large a learning rate to use when training.'
-  )
-  parser.add_argument(
-      '--testing_percentage',
-      type=int,
-      default=10,
-      help='What percentage of images to use as a test set.'
-  )
-  parser.add_argument(
-      '--validation_percentage',
-      type=int,
-      default=10,
-      help='What percentage of images to use as a validation set.'
-  )
-  parser.add_argument(
-      '--eval_step_interval',
-      type=int,
-      default=10,
-      help='How often to evaluate the training results.'
-  )
-  parser.add_argument(
-      '--train_batch_size',
-      type=int,
-      default=100,
-      help='How many images to train on at a time.'
-  )
-  parser.add_argument(
-      '--test_batch_size',
-      type=int,
-      default=-1,
-      help="""\
-      How many images to test on. This test set is only used once, to evaluate
-      the final accuracy of the model after training completes.
-      A value of -1 causes the entire test set to be used, which leads to more
-      stable results across runs.\
-      """
-  )
-  parser.add_argument(
-      '--validation_batch_size',
-      type=int,
-      default=100,
-      help="""\
-      How many images to use in an evaluation batch. This validation set is
-      used much more often than the test set, and is an early indicator of how
-      accurate the model is during training.
-      A value of -1 causes the entire validation set to be used, which leads to
-      more stable results across training iterations, but may be slower on large
-      training sets.\
-      """
-  )
-  parser.add_argument(
-      '--print_misclassified_test_images',
-      default=False,
-      help="""\
-      Whether to print out a list of all misclassified test images.\
-      """,
-      action='store_true'
-  )
-  parser.add_argument(
-      '--model_dir',
-      type=str,
-      default='/tmp/imagenet',
-      help="""\
-      Path to classify_image_graph_def.pb,
-      imagenet_synset_to_human_label_map.txt, and
-      imagenet_2012_challenge_label_map_proto.pbtxt.\
-      """
-  )
-  parser.add_argument(
-      '--bottleneck_dir',
-      type=str,
-      default='/tmp/bottleneck',
-      help='Path to cache bottleneck layer values as files.'
-  )
-  parser.add_argument(
-      '--final_tensor_name',
-      type=str,
-      default='final_result',
-      help="""\
-      The name of the output classification layer in the retrained graph.\
-      """
-  )
-  parser.add_argument(
-      '--flip_left_right',
-      default=False,
-      help="""\
-      Whether to randomly flip half of the training images horizontally.\
-      """,
-      action='store_true'
-  )
-  parser.add_argument(
-      '--random_crop',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much of a margin to randomly crop off the
-      training images.\
-      """
-  )
-  parser.add_argument(
-      '--random_scale',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much to randomly scale up the size of the
-      training images by.\
-      """
-  )
-  parser.add_argument(
-      '--random_brightness',
-      type=int,
-      default=0,
-      help="""\
-      A percentage determining how much to randomly multiply the training image
-      input pixels up or down by.\
-      """
-  )
-  parser.add_argument(
-      '--architecture',
-      type=str,
-      default='inception_v3',
-      help="""\
-      Which model architecture to use. 'inception_v3' is the most accurate, but
-      also the slowest. For faster or smaller models, chose a MobileNet with the
-      form 'mobilenet_<parameter size>_<input_size>[_quantized]'. For example,
-      'mobilenet_1.0_224' will pick a model that is 17 MB in size and takes 224
-      pixel input images, while 'mobilenet_0.25_128_quantized' will choose a much
-      smaller and less accurate model, taking 128x128 images, and instrumented
-      for eventual quantization via TensorFlow Lite.
-      See https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html
-      for more information on Mobilenet.\
-      """)
-  parser.add_argument(
-      '--saved_model_dir',
-      type=str,
-      default='/tmp/saved_models/1/',
-      help='Where to save the exported graph.')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
deleted file mode 100644
index fb7324c58ac1be60baad840207f31a61ec6182be..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# pylint: disable=g-bad-import-order,unused-import
-"""Tests the graph freezing tool."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-import os
-
-from tensorflow.examples.image_retraining import retrain
-from tensorflow.python.framework import test_util
-
-
-class ImageRetrainingTest(test_util.TensorFlowTestCase):
-
-  def dummyImageLists(self):
-    return {'label_one': {'dir': 'somedir', 'training': ['image_one.jpg',
-                                                         'image_two.jpg'],
-                          'testing': ['image_three.jpg', 'image_four.jpg'],
-                          'validation': ['image_five.jpg', 'image_six.jpg']},
-            'label_two': {'dir': 'otherdir', 'training': ['image_one.jpg',
-                                                          'image_two.jpg'],
-                          'testing': ['image_three.jpg', 'image_four.jpg'],
-                          'validation': ['image_five.jpg', 'image_six.jpg']}}
-
-  def testGetImagePath(self):
-    image_lists = self.dummyImageLists()
-    self.assertEqual('image_dir/somedir/image_one.jpg', retrain.get_image_path(
-        image_lists, 'label_one', 0, 'image_dir', 'training'))
-    self.assertEqual('image_dir/otherdir/image_four.jpg',
-                     retrain.get_image_path(image_lists, 'label_two', 1,
-                                            'image_dir', 'testing'))
-
-  def testGetBottleneckPath(self):
-    image_lists = self.dummyImageLists()
-    self.assertEqual('bottleneck_dir/somedir/image_five.jpg_imagenet_v3.txt',
-                     retrain.get_bottleneck_path(
-                         image_lists, 'label_one', 0, 'bottleneck_dir',
-                         'validation', 'imagenet_v3'))
-
-  def testShouldDistortImage(self):
-    self.assertEqual(False, retrain.should_distort_images(False, 0, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(True, 0, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 10, 0, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 0, 1, 0))
-    self.assertEqual(True, retrain.should_distort_images(False, 0, 0, 50))
-
-  def testAddInputDistortions(self):
-    with tf.Graph().as_default():
-      with tf.Session() as sess:
-        retrain.add_input_distortions(True, 10, 10, 10, 299, 299, 3, 128, 128)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('DistortJPGInput:0'))
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('DistortResult:0'))
-
-  @tf.test.mock.patch.object(retrain, 'FLAGS', learning_rate=0.01)
-  def testAddFinalRetrainOps(self, flags_mock):
-    with tf.Graph().as_default():
-      with tf.Session() as sess:
-        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
-        # Test creating final training op with quantization.
-        retrain.add_final_retrain_ops(5, 'final', bottleneck, 1024, False,
-                                      False)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-
-  @tf.test.mock.patch.object(retrain, 'FLAGS', learning_rate=0.01)
-  def testAddFinalRetrainOpsQuantized(self, flags_mock):
-    # Ensure that the training and eval graph for quantized models are correctly
-    # created.
-    with tf.Graph().as_default() as g:
-      with tf.Session() as sess:
-        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
-        # Test creating final training op with quantization, set is_training to
-        # true.
-        retrain.add_final_retrain_ops(5, 'final', bottleneck, 1024, True, True)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-        found_fake_quant = 0
-        for op in g.get_operations():
-          if op.type == 'FakeQuantWithMinMaxVars':
-            found_fake_quant += 1
-            # Ensure that the inputs of each FakeQuant operations has 2 Assign
-            # operations in the training graph (Assign[Min,Max]Last,
-            # Assign[Min,Max]Ema)
-            self.assertEqual(2,
-                             len([i for i in op.inputs if 'Assign' in i.name]))
-        self.assertEqual(found_fake_quant, 2)
-    with tf.Graph().as_default() as g:
-      with tf.Session() as sess:
-        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
-        # Test creating final training op with quantization, set is_training to
-        # false.
-        retrain.add_final_retrain_ops(5, 'final', bottleneck, 1024, True, False)
-        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-        found_fake_quant = 0
-        for op in g.get_operations():
-          if op.type == 'FakeQuantWithMinMaxVars':
-            found_fake_quant += 1
-            for i in op.inputs:
-              # Ensure that no operations are Assign operation since this is the
-              # evaluation graph.
-              self.assertTrue('Assign' not in i.name)
-        self.assertEqual(found_fake_quant, 2)
-
-  def testAddEvaluationStep(self):
-    with tf.Graph().as_default():
-      final = tf.placeholder(tf.float32, [1], name='final')
-      gt = tf.placeholder(tf.int64, [1], name='gt')
-      self.assertIsNotNone(retrain.add_evaluation_step(final, gt))
-
-  def testAddJpegDecoding(self):
-    with tf.Graph().as_default():
-      jpeg_data, mul_image = retrain.add_jpeg_decoding(10, 10, 3, 0, 255)
-      self.assertIsNotNone(jpeg_data)
-      self.assertIsNotNone(mul_image)
-
-  def testCreateModelInfo(self):
-    did_raise_value_error = False
-    try:
-      retrain.create_model_info('no_such_model_name')
-    except ValueError:
-      did_raise_value_error = True
-    self.assertTrue(did_raise_value_error)
-    model_info = retrain.create_model_info('inception_v3')
-    self.assertIsNotNone(model_info)
-    self.assertEqual(299, model_info['input_width'])
-
-  def testCreateModelInfoQuantized(self):
-    # Test for mobilenet_quantized
-    model_info = retrain.create_model_info('mobilenet_1.0_224')
-    self.assertIsNotNone(model_info)
-    self.assertEqual(224, model_info['input_width'])
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index e7db9cddf02daf9a32d3ed859ee9bd35b2cae838..63dd18457fea42acb09058b9ddd4623d72d1fd04 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -457,7 +457,7 @@ class AudioProcessor(object):
           self.time_shift_offset_placeholder_: time_shift_offset,
       }
       # Choose a section of background noise to mix in.
-      if use_background:
+      if use_background or sample['label'] == SILENCE_LABEL:
         background_index = np.random.randint(len(self.background_data))
         background_samples = self.background_data[background_index]
         background_offset = np.random.randint(
@@ -465,7 +465,9 @@ class AudioProcessor(object):
         background_clipped = background_samples[background_offset:(
             background_offset + desired_samples)]
         background_reshaped = background_clipped.reshape([desired_samples, 1])
-        if np.random.uniform(0, 1) < background_frequency:
+        if sample['label'] == SILENCE_LABEL:
+          background_volume = np.random.uniform(0, 1)
+        elif np.random.uniform(0, 1) < background_frequency:
           background_volume = np.random.uniform(0, background_volume_range)
         else:
           background_volume = 0
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 0fd2177df7c4a79e12de58d377834915f7355532..1d5ebf6687fb944e4206b88dda9d979abd6bd804 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -1845,81 +1845,97 @@ func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_d
 	return op.Output(0)
 }
 
-// Returns the complex conjugate of a complex number.
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
+
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
 //
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
 // For example:
 //
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
 // ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "UniqueWithCounts",
 		Input: []tf.Input{
-			input,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
+// UniqueV2Attr is an optional argument to UniqueV2.
+type UniqueV2Attr func(optionalAttr)
 
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+// UniqueV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_idx"] = value
 	}
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// Finds unique elements in a 1-D tensor.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
 //
-// Set use_nesterov = True if you want to use Nesterov momentum.
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// That is for rows we have grad for, we update var and accum as follows:
+// For example:
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+// find the unique elements.
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.
+func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1928,118 +1944,180 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "UniqueV2",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a sequence of numbers.
+// UniqueAttr is an optional argument to Unique.
+type UniqueAttr func(optionalAttr)
+
+// UniqueOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueOutIdx(value tf.DataType) UniqueAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
 //
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
 // For example:
 //
 // ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
 // ```
 //
 // Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//	x: 1-D.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// Returns 1-D.1-D.
+func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "Unique",
 		Input: []tf.Input{
-			start, limit, delta,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// Shuffle dimensions of x according to a permutation and conjugate the result.
 //
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
+func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "ConjugateTranspose",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x, perm,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// Reshapes a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// Given `tensor`, this operation returns a tensor that has the same values
+// as `tensor` with shape `shape`.
 //
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// If one component of `shape` is the special value -1, the size of that dimension
+// is computed so that the total size remains constant.  In particular, a `shape`
+// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
 //
-// Arguments:
+// If `shape` is 1-D or higher, then the operation returns a tensor with shape
+// `shape` filled with the values of `tensor`. In this case, the number of elements
+// implied by `shape` must be the same as the number of elements in `tensor`.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// For example:
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// ```
+// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
+// # tensor 't' has shape [9]
+// reshape(t, [3, 3]) ==> [[1, 2, 3],
+//                         [4, 5, 6],
+//                         [7, 8, 9]]
+//
+// # tensor 't' is [[[1, 1], [2, 2]],
+// #                [[3, 3], [4, 4]]]
+// # tensor 't' has shape [2, 2, 2]
+// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
+//                         [3, 3, 4, 4]]
+//
+// # tensor 't' is [[[1, 1, 1],
+// #                 [2, 2, 2]],
+// #                [[3, 3, 3],
+// #                 [4, 4, 4]],
+// #                [[5, 5, 5],
+// #                 [6, 6, 6]]]
+// # tensor 't' has shape [3, 2, 3]
+// # pass '[-1]' to flatten 't'
+// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+//
+// # -1 can also be used to infer the shape
+//
+// # -1 is inferred to be 9:
+// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 2:
+// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 3:
+// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
+//                               [2, 2, 2],
+//                               [3, 3, 3]],
+//                              [[4, 4, 4],
+//                               [5, 5, 5],
+//                               [6, 6, 6]]]
+//
+// # tensor 't' is [7]
+// # shape `[]` reshapes to a scalar
+// reshape(t, []) ==> 7
+// ```
+//
+// Arguments:
+//
+//	shape: Defines the shape of the output tensor.
+func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "Reshape",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			tensor, shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
+// Checks a tensor for NaN and Inf values.
+//
+// When run, reports an `InvalidArgument` error if `tensor` has any values
+// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+//	message: Prefix of the error message.
+func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{"message": message}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "CheckNumerics",
 		Input: []tf.Input{
-			handle,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -2047,104 +2125,81 @@ func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
+// Returns the complex conjugate of a complex number.
 //
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
 //
 // For example:
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
-//
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
 // ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
+		Type: "Conj",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
+// That is for rows we have grad for, we update var and accum as follows:
 //
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2153,80 +2208,81 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Counts the number of occurrences of each value in an integer array.
+// Creates a sequence of numbers.
 //
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
-// Values in `arr` outside of the range [0, size) are ignored.
+// For example:
+//
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
 //
 // Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "Range",
 		Input: []tf.Input{
-			arr, size, weights,
+			start, limit, delta,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
+// Computes gradients for SparseSegmentSqrtN.
+//
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the mean along sparse segments of a tensor.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
 // dimension, selecting a subset of dimension 0, specified by `indices`.
 //
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
-//
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// ```
-//
 // Arguments:
 //
 //	indices: A 1-D tensor. Has same rank as `segment_ids`.
@@ -2234,12 +2290,12 @@ func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (b
 //
 // Returns Has same shape as data, except for dimension 0 which
 // has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
 			data, indices, segment_ids,
 		},
@@ -2248,171 +2304,109 @@ func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_i
 	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Pop the element at the top of the stack.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
+//
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu6",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the sum along segments of a tensor.
+// Computes the sum along sparse segments of a tensor.
+//
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// Computes a tensor such that
-// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
+// For example:
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// `num_segments` should equal the number of distinct segment IDs.
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
 //
 // Arguments:
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
 
-// Assign `value` to the sliced l-value reference of `ref`.
-//
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
-//
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// PreventGradientMessage sets the optional message attribute to value.
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
-		Input: []tf.Input{
-			ref, begin, end, strides, value,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
-
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["message"] = value
 	}
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
+// An identity op that triggers an error if a gradient is requested.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
 //
 // Arguments:
+//	input: any tensor.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2421,9 +2415,9 @@ func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "PreventGradient",
 		Input: []tf.Input{
-			input, dimension,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -2431,17 +2425,13 @@ func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// Returns which elements of x are finite.
-//
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "Asin",
 		Input: []tf.Input{
 			x,
 		},
@@ -2450,39 +2440,53 @@ func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
 //
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// Converts a sparse representation into a dense tensor.
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
+// Builds an array `dense` with shape `output_shape` such that
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
+//
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2491,9 +2495,9 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			a, b,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
 		Attrs: attrs,
 	}
@@ -2501,284 +2505,475 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 	return op.Output(0)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
+// Computes the sum along sparse segments of a tensor.
 //
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // For example:
 //
 // ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
 //
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
 //
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
 // ```
 //
 // Arguments:
 //
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			condition, x, y,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
-//
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "Sinh",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-//
-//
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "Relu6",
 		Input: []tf.Input{
-			a, b, x,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Computes the sum along segments of a tensor.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// Computes a tensor such that
+// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
-		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
 //
-// The upper regularized incomplete Gamma function is defined as:
+// `num_segments` should equal the number of distinct segment IDs.
 //
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
-// where
+// Arguments:
 //
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
 //
-// is the upper incomplete Gama function.
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			a, x,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
 
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
 // If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["begin_mask"] = value
 	}
 }
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
 // If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["end_mask"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 //
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			true_classes,
+			ref, begin, end, strides, value,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
+
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Returns the index with the largest value across dimensions of a tensor.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			x, y,
+			input, dimension,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Forwards the input to the output.
+// Returns which elements of x are finite.
 //
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsFinite",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// MatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// Multiply the matrix "a" by the matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
+//
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MatMul",
+		Input: []tf.Input{
+			a, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Selects elements from `x` or `y`, depending on `condition`.
+//
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
+//
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
+//
+// For example:
+//
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
 //
 // Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
 //
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LoopCond",
+		Type: "Select",
 		Input: []tf.Input{
-			input,
+			condition, x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of x OR y element-wise.
+//
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
+//
+// The regularized incomplete beta integral is defined as:
+//
+//
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Betainc",
+		Input: []tf.Input{
+			a, b, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+//
+// The upper regularized incomplete Gamma function is defined as:
+//
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+//
+// where
+//
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igammac",
+		Input: []tf.Input{
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -3257,14 +3452,77 @@ func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
-
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+// Shuffle dimensions of x according to a permutation.
+//
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Transpose",
+		Input: []tf.Input{
+			x, perm,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
+
+// MinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the minimum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Min",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
+
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
@@ -4419,6 +4677,66 @@ func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Computes the inverse permutation of a tensor.
+//
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
+//
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
+//
+// The values must include 0. There can be no duplicate values or negative values.
+//
+// For example:
+//
+// ```
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InvertPermutation",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes log softmax activations.
+//
+// For each batch `i` and class `j` we have
+//
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+//
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogSoftmax",
+		Input: []tf.Input{
+			logits,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the truth value of (x <= y) element-wise.
 //
 // *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
@@ -4614,6 +4932,70 @@ func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Identity op for gradient debugging.
+//
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DebugGradientIdentity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// var: Should be from a Variable().
+//
+// Arguments:
+//
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdadelta",
+		Input: []tf.Input{
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Computes rectified linear gradients for a Relu operation.
 //
 // Arguments:
@@ -5657,66 +6039,6 @@ func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
-//
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
-//
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
-		Input: []tf.Input{
-			logits,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
-//
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
-//
-// For example:
-//
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // BiasAddGradAttr is an optional argument to BiasAddGrad.
 type BiasAddGradAttr func(optionalAttr)
 
@@ -5919,107 +6241,64 @@ func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
-//
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
+
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "RFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+}
+
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["input_min"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["input_max"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+// Use QuantizeAndDequantizeV2 instead.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -6027,251 +6306,280 @@ func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_s
 	return op.Output(0)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
-
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// Returns locations of nonzero / true values in a tensor.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNGradBias sets the optional bias attribute to value.
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNGradAlpha sets the optional alpha attribute to value.
+// For example:
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNGradBeta sets the optional beta attribute to value.
+// ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "Where",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			condition,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
 
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
+// Dequeues a tuple of one or more tensors from the given queue.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			input, axis,
+			handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
-
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
 	}
+	return components
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "Erf",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			x,
 		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "Floor",
 		Input: []tf.Input{
-			shape,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
 
-// AssertSummarize sets the optional summarize attribute to value.
+// OneHotAxis sets the optional axis attribute to value.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["axis"] = value
 	}
 }
 
-// Asserts that the given condition is true.
+// Returns a one-hot tensor.
 //
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
+//
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
+//
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
+//
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
+//
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
+//
+//
+// Examples
+// =========
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+//
+//     ```output =
+//       [5.0 0.0 0.0]  // one_hot(0)
+//       [0.0 0.0 5.0]  // one_hot(2)
+//       [0.0 0.0 0.0]  // one_hot(-1)
+//       [0.0 5.0 0.0]  // one_hot(1)
+//     ```
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+//
+//     ```output =
+//       [0.0 3.0 3.0 3.0]
+//       [3.0 3.0 3.0 0.0]
+//       [3.0 3.0 3.0 3.0]
+//       [3.0 0.0 3.0 3.0]
+//     //  ^                one_hot(0)
+//     //      ^            one_hot(2)
+//     //          ^        one_hot(-1)
+//     //              ^    one_hot(1)
+//     ```
+// Suppose that
+//
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+//
+//     ```output =
+//       [
+//         [1.0, 0.0, 0.0]  // one_hot(0)
+//         [0.0, 0.0, 1.0]  // one_hot(2)
+//       ][
+//         [0.0, 1.0, 0.0]  // one_hot(1)
+//         [0.0, 0.0, 0.0]  // one_hot(-1)
+//       ]```
 //
 // Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
 //
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6280,79 +6588,67 @@ func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...Ass
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "OneHot",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			indices, depth, on_value, off_value,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
-//
-// For each entry in `x`, calculates the number of `1` (on) bits in the binary
-// representation of that entry.
-//
-// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
-// `int32` or `int64` and perform the bitcount on the result, than to feed in
-// 8- or 16-bit inputs and then aggregate the resulting counts.
-func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "PopulationCount",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
-//
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
 		Attrs: attrs,
 	}
@@ -6362,167 +6658,185 @@ func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf
 	}
 	var idx int
 	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
 		return
 	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
 		return
 	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
 		return
 	}
-	return output_indices, output_values, output_shape
-}
-
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
 	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Use RandomPoissonV2 instead.
+// Real-valued fast Fourier transform.
 //
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "RFFT",
 		Input: []tf.Input{
-			shape, rate,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Associates the given iterator with the given statistics aggregator.
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
+
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// Returns the created operation.
-func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IteratorSetStatsAggregator",
-		Input: []tf.Input{
-			iterator_handle, stats_aggregator_handle,
-		},
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
-
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["bias"] = value
 	}
 }
 
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["alpha"] = value
 	}
 }
 
-// Returns the permuted vector/tensor in the destination data format given the
+// LRNGradBeta sets the optional beta attribute to value.
 //
-// one in the source data format.
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6531,9 +6845,9 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			x,
+			input_grads, input_image, output_image,
 		},
 		Attrs: attrs,
 	}
@@ -6541,41 +6855,70 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 	return op.Output(0)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
+
+// AnyKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the "logical or" of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "Any",
 		Input: []tf.Input{
-			x,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
 // accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
 // quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
 // var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
 // accum = accum_new
@@ -6585,14 +6928,13 @@ func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 //	accum: Should be from a Variable().
 //	linear: Should be from a Variable().
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
 //	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6601,100 +6943,62 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns which elements of x are Inf.
-//
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IsInf",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// RandomUniformSeed sets the optional seed attribute to value.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
-
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["seed2"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// Outputs random values from a uniform distribution.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
 // Arguments:
 //	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	dtype: The type of the output.
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			shape, seed,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -6702,129 +7006,158 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 	return op.Output(0)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["summarize"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// Asserts that the given condition is true.
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "Assert",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
 //
-// For example:
+// For each entry in `x`, calculates the number of `1` (on) bits in the binary
+// representation of that entry.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+// `int32` or `int64` and perform the bitcount on the result, than to feed in
+// 8- or 16-bit inputs and then aggregate the resulting counts.
+func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "PopulationCount",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
+//
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			input,
+			split_dim, indices, values, shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
 
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["seed"] = value
 	}
 }
 
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
-//
-// For example:
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Use RandomPoissonV2 instead.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6833,9 +7166,9 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			real, imag,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -6843,42 +7176,46 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 	return op.Output(0)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
 
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	x: 1-D.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6887,41 +7224,63 @@ func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
+		Type: "ResourceSparseApplyFtrlV2",
 		Input: []tf.Input{
-			x,
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
+// Associates the given iterator with the given statistics aggregator.
+//
+// Returns the created operation.
+func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IteratorSetStatsAggregator",
+		Input: []tf.Input{
+			iterator_handle, stats_aggregator_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
+
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["src_format"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// one in the source data format.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6930,9 +7289,9 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
-			shape, seed,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -6940,70 +7299,86 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
-//
-// ```
+// Reads the value of a variable.
 //
-// Arguments:
+// The tensor returned by this operation is immutable.
 //
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			resource,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// GatherAttr is an optional argument to Gather.
-type GatherAttr func(optionalAttr)
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// GatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func GatherValidateIndices(value bool) GatherAttr {
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
+
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Gather slices from `params` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// If `indices` is a permutation and `len(indices) == params.shape[0]` then
-// this operation will permute `params` accordingly.
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
-// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-// `indices` are always validated to be within range. If assigned to GPU,
-// out-of-bound indices result in safe but unspecified behavior, which may include
-// raising an error.
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7012,165 +7387,112 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Gather",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			params, indices,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// Returns which elements of x are Inf.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "IsInf",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// N is the size of the segment being reduced.
 //
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			input, fft_length,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
-
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
-//
-// value: A `bool`. If `True`, skip the empty strings from the result.
-// If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
-	return func(m optionalAttr) {
-		m["skip_empty"] = value
-	}
-}
-
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
-//
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
-//
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
-//
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			input, delimiter,
+			a_indices, a_values, a_shape, b,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["dtype"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-// Input images can be of different types but output images are always float.
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7179,9 +7501,9 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			images, size,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -7189,168 +7511,140 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softsign",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
 
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
 // If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
 	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
-//
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
-//
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
+// Restores a tensor from checkpoint files.
 //
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
 //
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
+			file_pattern, tensor_name, shape_and_slice,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// For example:
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
+		Type: "Imag",
 		Input: []tf.Input{
-			resource, indices, updates,
+			input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
 
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["Tout"] = value
 	}
 }
 
-// Computes gradients of the average pooling function.
+// Converts two real numbers to a complex number.
 //
-// Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "Complex",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			real, imag,
 		},
 		Attrs: attrs,
 	}
@@ -7358,184 +7652,113 @@ func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize
 	return op.Output(0)
 }
 
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["dtype"] = value
 	}
 }
 
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// The generated values will have mean 0 and standard deviation 1.
 //
-// Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageClear",
-
+		Type: "StatelessRandomNormal",
+		Input: []tf.Input{
+			shape, seed,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
-
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Computes the ids of the positions in sampled_candidates that match true_labels.
-//
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
+// ```
 //
 // Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
 //
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			tensor, shape, input_min, input_max,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
+// GatherAttr is an optional argument to Gather.
+type GatherAttr func(optionalAttr)
 
-// CumsumReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+// GatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func GatherValidateIndices(value bool) GatherAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
+// Gather slices from `params` according to `indices`.
 //
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
 // ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
 // ```
 //
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
+// If `indices` is a permutation and `len(indices) == params.shape[0]` then
+// this operation will permute `params` accordingly.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+// `indices` are always validated to be within range. If assigned to GPU,
+// out-of-bound indices result in safe but unspecified behavior, which may include
+// raising an error.
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7544,9 +7767,9 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "Gather",
 		Input: []tf.Input{
-			x, axis,
+			params, indices,
 		},
 		Attrs: attrs,
 	}
@@ -7554,577 +7777,479 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (
 	return op.Output(0)
 }
 
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// Returns the truth value of (x != y) element-wise.
+//
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "NotEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+// Inverse 3D real-valued fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
+//
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "IRFFT3D",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
 
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
 //
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
 	return func(m optionalAttr) {
-		m["header_bytes"] = value
+		m["skip_empty"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
+//
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
+//
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
+//
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
+//
+// Arguments:
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringSplit",
+		Input: []tf.Input{
+			input, delimiter,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["hop_bytes"] = value
+		m["align_corners"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+// Resize `images` to `size` using bilinear interpolation.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
-//
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	record_bytes: Number of bytes in the record.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
+		Type: "ResizeBilinear",
+		Input: []tf.Input{
+			images, size,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
-//
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-//
-// Arguments:
-//
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "Softsign",
 		Input: []tf.Input{
-			string_tensor,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
+
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EluGrad",
-		Input: []tf.Input{
-			gradients, outputs,
-		},
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+	return func(m optionalAttr) {
+		m["old_vocab_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
-// Arguments:
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
+// `num_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
 //
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
 //
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping
+//
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
+//
+// Arguments:
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
+//
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			input_dataset, count,
+			new_vocab_file, old_vocab_file,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// The gradient operator for the SparseAdd op.
+// Assigns sparse updates to the variable referenced by `resource`.
 //
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
 //
 // Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+// Returns the created operation.
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
+		Type: "ResourceScatterUpdate",
 		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan",
-		Input: []tf.Input{
-			x,
-		},
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
+
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
-//
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
-//
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "EncodeWav",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			audio, sample_rate,
+			orig_input_shape, grad,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
-//
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
-//
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
-//
-// Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
-		},
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
-//
-// For example:
-//
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
-//
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
-//
-// See `dynamic_stitch` for an example on how to merge partitions back.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
-	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
-		Input: []tf.Input{
-			data, partitions,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	return outputs
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
-
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Update '*var' according to the adagrad scheme.
-//
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
+// Op removes all elements in the underlying container.
 //
 // Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, grad,
-		},
+		Type: "StageClear",
+
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Return the shape of s0 op s1 with broadcast.
-//
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
 
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["seed"] = value
 	}
 }
 
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["seed2"] = value
 	}
 }
 
-// Returns the dimension index in the destination data format given the one in
+// Computes the ids of the positions in sampled_candidates that match true_labels.
 //
-// the source data format.
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
 //
 // Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
 //
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "ComputeAccidentalHits",
 		Input: []tf.Input{
-			x,
+			true_classes, sampled_candidates,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
 
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_type"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
 //
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8133,213 +8258,176 @@ func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
-// CumprodExclusive sets the optional exclusive attribute to value.
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["header_bytes"] = value
 	}
 }
 
-// CumprodReverse sets the optional reverse attribute to value.
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["footer_bytes"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
 //
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["encoding"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+//	record_bytes: Number of bytes in the record.
+//
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
-		Input: []tf.Input{
-			x, axis,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+// The hash function is deterministic on the content of the string within the
+// process.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
 //
 // Arguments:
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+//	num_buckets: The number of buckets.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
-			data, segment_ids,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Computes gradients for the exponential linear (Elu) operation.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
 //
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "EluGrad",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			gradients, outputs,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
 // Arguments:
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -8347,176 +8435,109 @@ func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, o
 	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
+// The gradient operator for the SparseAdd op.
 //
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT",
+		Type: "SparseAddGrad",
 		Input: []tf.Input{
-			input,
+			backprop_val_grad, a_indices, b_indices, sum_indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "Atan",
 		Input: []tf.Input{
-			start, stop, num,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
-
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+// Encode audio data using the WAV file format.
 //
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
-	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
-	}
-}
-
-// Deletes the resource specified by the handle.
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-// All subsequent operations using the resource will result in a NotFound
-// error status.
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	resource: handle to the resource to delete.
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			resource,
+			audio, sample_rate,
 		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNBeta sets the optional beta attribute to value.
-//
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "StringToHashBucketStrong",
 		Input: []tf.Input{
 			input,
 		},
@@ -8526,153 +8547,170 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes numerical negative value element-wise.
+//
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "Neg",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// Execute a sub graph on a remote processor.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	inputs: Arbitrary number of tensors with arbitrary data types
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "RemoteFusedGraphExecute",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
+	}
+	return outputs
 }
 
-// 2D real-valued fast Fourier transform.
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
-//
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			input, fft_length,
+			orig_input, orig_output, grad,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
 
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["data_format"] = value
 	}
 }
 
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
 //
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "Conv3DBackpropFilterV2",
 		Input: []tf.Input{
-			images, size,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -8680,91 +8718,38 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pad",
-		Input: []tf.Input{
-			input, paddings,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
 
-// Checks whether a resource handle-based variable has been initialized.
-//
-// Arguments:
-//	resource: the input resource handle.
-//
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
-		Input: []tf.Input{
-			resource,
-		},
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
-
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["narrow_range"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// and `max` to 'outputs' tensor of same shape as `inputs`.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8773,9 +8758,9 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			shape, seed,
+			inputs, min, max,
 		},
 		Attrs: attrs,
 	}
@@ -8783,255 +8768,260 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// Applies softmax to a batched N-D `SparseTensor`.
+//
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+//
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
+//
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			data,
+			sp_indices, sp_values, sp_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the argument of a complex number.
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+//
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
+//
+// `data.shape` must start with `partitions.shape`.
 //
 // For example:
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
 // ```
 //
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+// See `dynamic_stitch` for an example on how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			input,
+			data, partitions,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
+	}
+	return outputs
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
 
-// VarHandleOpContainer sets the optional container attribute to value.
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["use_locking"] = value
 	}
 }
 
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
+// Update '*var' according to the adagrad scheme.
 //
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a Variable resource.
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
+		Type: "ResourceApplyAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, grad,
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Elementwise computes the bitwise XOR of `x` and `y`.
+// Return the shape of s0 op s1 with broadcast.
 //
-// The result will have those bits set, that are different in `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseXor",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
-			x, y,
+			s0, s1,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
+	}
+}
+
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// then the final deserialized `SparseTensor` will be:
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the dimension index in the destination data format given the one in
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// the source data format.
 //
 // Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
+//
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			serialized_sparse,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
 
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// Update '*var' according to the AddSign update.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
+//	m: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
 //	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9040,77 +9030,100 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
-
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+// Computes the mean along segments of a tensor.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Applies sparse `updates` to individual values or slices within a given
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// variable according to `indices`.
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
 //
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
+// Arguments:
 //
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMean",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// ```python
-//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// The resulting update to ref would look like this:
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-//     [1, 11, 3, 10, 9, 6, 7, 12]
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
 //
-// See @{tf.scatter_nd} for more details about how to make updates to
-// slices.
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9119,109 +9132,123 @@ func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, upd
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			ref, indices, updates,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
-
-// SqueezeAxis sets the optional axis attribute to value.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// Arguments:
 //
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
-	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
+// Inverse fast Fourier transform.
 //
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
-// For example:
+// Arguments:
+//	input: A complex64 tensor.
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
 //
-// Or, to remove specific size 1 dimensions:
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Generates values in an interval.
+//
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
 //
 // ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
 // ```
 //
 // Arguments:
-//	input: The `input` to squeeze.
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
 //
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			input,
+			start, stop, num,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["ignore_lookup_error"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
+// Deletes the resource specified by the handle.
 //
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
+//	resource: handle to the resource to delete.
 //
 // Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9230,58 +9257,75 @@ func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			resource,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
+//
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
 // If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+func LRNBeta(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+		m["beta"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Local Response Normalization.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
 //
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9290,9 +9334,9 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "LRN",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -9300,16 +9344,16 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 	return op.Output(0)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorDataset",
+		Type: "ZipDataset",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			tf.OutputList(input_datasets),
 		},
 		Attrs: attrs,
 	}
@@ -9317,78 +9361,36 @@ func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shap
 	return op.Output(0)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
+// value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9397,168 +9399,87 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// 2D real-valued fast Fourier transform.
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Greater",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			x, y,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["align_corners"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// Resize `images` to `size` using area interpolation.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// Input images can be of different types but output images are always float.
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
-//
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
-//
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
-//
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
-	}
-}
-
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9567,103 +9488,112 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			images, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// Pads a tensor with zeros.
 //
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+// The padded size of each dimension D of the output is:
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "Pad",
 		Input: []tf.Input{
-			input,
+			input, paddings,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// Checks whether a resource handle-based variable has been initialized.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	resource: the input resource handle.
+//
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			x, y,
+			resource,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["dtype"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// All elements selected by `indices` must have the same shape.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -9671,97 +9601,80 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for integer types.
+// Makes its input available to the next iteration.
 //
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			x, y,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
-//
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
-//
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
-//
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
-//
-// Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
-//
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
-		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
-		},
-		Attrs: attrs,
+		Type: "Fact",
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
+	return op.Output(0)
+}
+
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
+
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
 	}
-	return tensors
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// Returns the argument of a complex number.
 //
-// Arguments:
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
+// For example:
 //
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
+//
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "Angle",
 		Input: []tf.Input{
-			input_dataset, count,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -9769,134 +9682,174 @@ func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
+
+// VarHandleOpContainer sets the optional container attribute to value.
 //
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
+		Type: "VarHandleOp",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Elementwise computes the bitwise XOR of `x` and `y`.
+//
+// The result will have those bits set, that are different in `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "BitwiseXor",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
+// Deserialize `SparseTensor` objects.
 //
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
 //
-// Arguments:
-//	input: Base64 strings to decode.
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Store the input tensor in the state of the current session.
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// Arguments:
-//	value: The tensor to be stored.
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
+		Type: "DeserializeSparse",
 		Input: []tf.Input{
-			value,
+			serialized_sparse,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// Update '*var' according to the RMSProp algorithm.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9905,157 +9858,77 @@ func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Floor",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Erf",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
 
-// OneHotAxis sets the optional axis attribute to value.
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
+// Applies sparse `updates` to individual values or slices within a given
 //
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
+// variable according to `indices`.
 //
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// Examples
-// =========
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
 //
-// Suppose that
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
 // ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
 // ```
 //
-// Then output is `[4 x 3]`:
-//
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
-//
-// Suppose that
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
 //
+// ```python
+//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
 // ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
-//
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
 //
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
+// The resulting update to ref would look like this:
 //
-// Then output is `[2 x 2 x 3]`:
+//     [1, 11, 3, 10, 9, 6, 7, 12]
 //
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
+// See @{tf.scatter_nd} for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
 //
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10064,84 +9937,70 @@ func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OneHot",
-		Input: []tf.Input{
-			indices, depth, on_value, off_value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Reads the value of a variable.
-//
-// The tensor returned by this operation is immutable.
-//
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "ResourceScatterNdUpdate",
 		Input: []tf.Input{
-			resource,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
 
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["squeeze_dims"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// Removes dimensions of size 1 from the shape of a tensor.
+//
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
+//
+// For example:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//	input: The `input` to squeeze.
+//
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -10149,43 +10008,98 @@ func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, gr
 	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Update '*var' according to the adadelta scheme.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdadelta",
+		Input: []tf.Input{
+			var_, accum, accum_update, lr, rho, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+//
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+	return func(m optionalAttr) {
+		m["iou_threshold"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10194,9 +10108,9 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			boxes, scores, max_output_size,
 		},
 		Attrs: attrs,
 	}
@@ -10204,166 +10118,265 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
-//
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "TensorDataset",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
 //
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "SparseDenseCwiseMul",
 		Input: []tf.Input{
-			input, fft_length,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// DecodeJpegChannels sets the optional channels attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["use_locking"] = value
 	}
 }
 
-// DecodeJpegRatio sets the optional ratio attribute to value.
+// Update '*var' according to the RMSProp algorithm.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyRMSProp",
+		Input: []tf.Input{
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// Returns the truth value of (x > y) element-wise.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Greater",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+		m["seed"] = value
 	}
 }
 
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+		m["seed2"] = value
 	}
 }
 
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["min_object_covered"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
 //
-// Accepted values are:
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10372,128 +10385,85 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			contents,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
 //
 // Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "ParseExample",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	return op.Output(0)
 }
 
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
 
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+//
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Returns the shape of the variable pointed to by `resource`.
+// Gather specific elements from the TensorArray into output `value`.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// All elements selected by `indices` must have the same shape.
 //
-// For example:
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			input,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -10501,165 +10471,232 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// Returns x / y element-wise for integer types.
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
 //
-// Inputs are the logits, not probabilities.
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restores tensors from a V2 checkpoint.
+//
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
+//
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
+//
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			features, labels,
+			prefix, tensor_names, shape_and_slices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
 }
 
-// Fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
 // Arguments:
-//	input: A complex64 tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FFT",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// Computes the maximum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
 //
 // Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			serialized,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
-
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
-	return func(m optionalAttr) {
-		m["Targmax"] = value
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
-//
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// Decode web-safe base64-encoded strings.
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: Base64 strings to decode.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "DecodeBase64",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
 // value: If True, updating of the var and accum tensors will be protected by
 // a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	accum: Should be from a Variable().
 //	lr: Learning rate. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
 //	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10668,133 +10705,99 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			var_, accum, lr, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
 
-// EncodeJpegFormat sets the optional format attribute to value.
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
 	return func(m optionalAttr) {
-		m["format"] = value
+		m["data_format"] = value
 	}
 }
 
-// EncodeJpegQuality sets the optional quality attribute to value.
+// Computes gradients of max pooling function.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
-	}
-}
-
-// EncodeJpegProgressive sets the optional progressive attribute to value.
-//
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
-	}
-}
-
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
-//
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
-
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
-//
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
-//
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// EncodeJpegXDensity sets the optional x_density attribute to value.
-//
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
-//
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
-	}
-}
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
 //
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10803,9 +10806,9 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "SparseReduceSum",
 		Input: []tf.Input{
-			image,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -10813,48 +10816,28 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 	return op.Output(0)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
 
-// MultinomialSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["out_type"] = value
 	}
 }
 
-// MultinomialSeed2 sets the optional seed2 attribute to value.
+// Returns the shape of the variable pointed to by `resource`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+// For example:
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10863,9 +10846,9 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "VariableShape",
 		Input: []tf.Input{
-			logits, num_samples,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -10873,183 +10856,194 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 	return op.Output(0)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			x,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// 3D real-valued fast Fourier transform.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
 //
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input, fft_length,
+			features, labels,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
-
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
+// Fast Fourier transform.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
 //
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
-// If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
-}
-
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+// Arguments:
+//	input: A complex64 tensor.
 //
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
-
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
 //
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
-// If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
-//
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+	opspec := tf.OpSpec{
+		Type: "FFT",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// An array of Tensors of given size.
-//
-// Write data via Write and read via Read or Pack.
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
 // Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "ParseTensor",
 		Input: []tf.Input{
-			size,
+			serialized,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["Targmax"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// Performs max pooling on the input and outputs both max values and indices.
+//
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
+//
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11058,75 +11052,45 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
 
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
 // value: If True, updating of the var and accum tensors will be protected by
 // a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
-//
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
 //	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
+//	global_step: Training step number. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11135,201 +11099,193 @@ func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "ResourceSparseApplyAdagradDA",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
 
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
+// EncodeJpegFormat sets the optional format attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
+// value: Per pixel image format.
 // If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+func EncodeJpegFormat(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["format"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+// EncodeJpegQuality sets the optional quality attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["quality"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
 // If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["progressive"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+//
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["optimize_size"] = value
 	}
 }
 
-// Creates an empty hash table.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
-//
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
+}
 
-		Attrs: attrs,
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+//
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 2D fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+// EncodeJpegXDensity sets the optional x_density attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "IFFT2D",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a tensor filled with a scalar value.
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
 //
-// This operation creates a tensor of shape `dims` and fills it with `value`.
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
 //
-// For example:
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
 //
-// ```
-// # Output tensor has shape [2, 3].
-// fill([2, 3], 9) ==> [[9, 9, 9]
-//                      [9, 9, 9]]
-// ```
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
 //
-// Arguments:
-//	dims: 1-D. Represents the shape of the output tensor.
-//	value: 0-D (scalar). Value to fill the returned tensor.
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
 //
-// @compatibility(numpy)
-// Equivalent to np.full
-// @end_compatibility
-func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fill",
-		Input: []tf.Input{
-			dims, value,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// 2D fast Fourier transform.
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
 //
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FFT2D",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			input,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
 
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update '*var' as FOBOS algorithm with fixed learning rate.
+// MultinomialSeed2 sets the optional seed2 attribute to value.
 //
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-// Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11338,154 +11294,223 @@ func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
+			logits, num_samples,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
-//
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "LogicalNot",
 		Input: []tf.Input{
-			y, dy,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
+// 3D real-valued fast Fourier transform.
 //
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			handle,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x - y element-wise.
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
+
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
 //
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Sub",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+//
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
 //
-// Inputs are the logits, not probabilities.
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+//
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
+
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+//
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// An array of Tensors of given size.
+//
+// Write data via Write and read via Read or Pack.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "TensorArrayV3",
 		Input: []tf.Input{
-			features, labels,
+			size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
 
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["fast"] = value
 	}
 }
 
-// ReduceJoinSeparator sets the optional separator attribute to value.
+// Solves one or more linear least-squares problems.
 //
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins a string Tensor across the given dimensions.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
 //
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
 //
-// For example:
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-// tf.reduce_join(a, []) ==> ["abcd"]
-// ```
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11494,9 +11519,9 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -11504,199 +11529,130 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 	return op.Output(0)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+// Elementwise computes the bitwise OR of `x` and `y`.
+//
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cos",
+		Type: "BitwiseOr",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
 
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["data_format"] = value
 	}
 }
 
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
-
-// TopKSorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
+	return op.Output(0)
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// If `k` varies dynamically, use `TopKV2` below.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
-//
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
-			input,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
-//
-//
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Zeta",
-		Input: []tf.Input{
-			x, q,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// ProdKeepDims sets the optional keep_dims attribute to value.
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the product of elements across dimensions of a tensor.
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11705,95 +11661,104 @@ func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			input, axis,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 // If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// Creates an empty hash table.
 //
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
-		Input: []tf.Input{
-			input, size, paddings, filter,
-		},
+		Type: "MutableHashTableOfTensorsV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
+// Inverse 2D fast Fourier transform.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
+// Equivalent to np.fft.ifft2
 // @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "IFFT2D",
 		Input: []tf.Input{
 			input,
 		},
@@ -11802,126 +11767,95 @@ func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
+// Creates a tensor filled with a scalar value.
 //
-// This is a deprecated version of BiasAdd and will be soon removed.
+// This operation creates a tensor of shape `dims` and fills it with `value`.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// For example:
+//
+// ```
+// # Output tensor has shape [2, 3].
+// fill([2, 3], 9) ==> [[9, 9, 9]
+//                      [9, 9, 9]]
+// ```
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	dims: 1-D. Represents the shape of the output tensor.
+//	value: 0-D (scalar). Value to fill the returned tensor.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.full
+// @end_compatibility
+func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "Fill",
 		Input: []tf.Input{
-			value, bias,
+			dims, value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-//
-// Given a `tensor`, and a `int32` tensor `axis` representing the set of
-// dimensions of `tensor` to reverse. This operation reverses each dimension
-// `i` for which there exists `j` s.t. `axis[j] == i`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions specified
-// in `axis` may be 0 or more entries. If an index is specified more than
-// once, a InvalidArgument error is raised.
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [3] or 'dims' is [-1]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is '[1]' (or 'dims' is '[-3]')
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
+// 2D fast Fourier transform.
 //
-// # 'dims' is '[2]' (or 'dims' is '[-2]')
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
-// `[-rank(tensor), rank(tensor))`.
+//	input: A complex64 tensor.
 //
-// Returns The same shape as `tensor`.
-func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseV2",
+		Type: "FFT2D",
 		Input: []tf.Input{
-			tensor, axis,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Returns the real part of a complex number.
+// Update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
-// For example:
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11930,437 +11864,335 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			input,
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+// Computes the gradient for the sqrt of `x` wrt its input.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "SqrtGrad",
 		Input: []tf.Input{
-			tag, tensor,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
-
-// QrFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
-//
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// Get the value of the tensor specified by its handle.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "GetSessionTensor",
 		Input: []tf.Input{
-			input,
+			handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns x - y element-wise.
+//
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BytesProducedStatsDataset",
+		Type: "Sub",
 		Input: []tf.Input{
-			input_dataset, tag,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
+		Type: "LogUniformCandidateSampler",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
-
-// MeanKeepDims sets the optional keep_dims attribute to value.
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Maximum",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the mean of elements across dimensions of a tensor.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input, axis,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["vocab_size"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["delimiter"] = value
+		m["separator"] = value
 	}
 }
 
-// Initializes a table from a text file.
+// Joins a string Tensor across the given dimensions.
 //
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.
 //
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
+// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
+// tf.reduce_join(a, []) ==> ["abcd"]
+// ```
 //
 // Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
 //
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			table_handle, filename,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
-
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
-//
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "Cos",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
+
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-// Reshaping does not affect the order of values in the SparseTensor.
-//
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
-//
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReshape",
-		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
-		},
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Deprecated. Use TensorArraySplitV3
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
-		Input: []tf.Input{
-			handle, value, lengths, flow_in,
-		},
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// PackAttr is an optional argument to Pack.
-type PackAttr func(optionalAttr)
-
-// PackAxis sets the optional axis attribute to value.
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// value: Dimension along which to pack.  Negative values wrap around, so the
-// valid range is `[-(R+1), R+1)`.
-// If not specified, defaults to 0
-func PackAxis(value int64) PackAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["is_training"] = value
 	}
 }
 
-// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
-//
-// Packs the `N` tensors in `values` into a tensor with rank one higher than each
-// tensor in `values`, by packing them along the `axis` dimension.
-// Given a list of tensors of shape `(A, B, C)`;
-//
-// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
-// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
-// Etc.
-//
-// For example:
-//
-// ```
-// # 'x' is [1, 4]
-// # 'y' is [2, 5]
-// # 'z' is [3, 6]
-// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
-// ```
+// Gradient for batch normalization.
 //
-// This is the opposite of `unpack`.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	values: Must be of same shape and type.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns The packed tensor.
-func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12369,96 +12201,119 @@ func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Pack",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
+
+// TopKSorted sets the optional sorted attribute to value.
 //
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
+	}
+}
+
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// Reordering does not affect the shape of the SparseTensor.
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
 //
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"k": k}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseReorder",
+		Type: "TopK",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu",
+		Type: "Zeta",
 		Input: []tf.Input{
-			features,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
 
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+// ProdKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+func ProdKeepDims(value bool) ProdAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// Computes the product of elements across dimensions of a tensor.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12467,61 +12322,68 @@ func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
+		Type: "Prod",
 		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
+// value: If true, rescale input by (new_height - 1) / (height - 1),
+// which exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
 // If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["resize_align_corners"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			input, size, paddings, filter,
 		},
 		Attrs: attrs,
 	}
@@ -12529,86 +12391,126 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// Adds `bias` to `value`.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// This is a deprecated version of BiasAdd and will be soon removed.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BiasAddV1",
+		Input: []tf.Input{
+			value, bias,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// Reverses specific dimensions of a tensor.
+//
+// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
+// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
+//
+// Given a `tensor`, and a `int32` tensor `axis` representing the set of
+// dimensions of `tensor` to reverse. This operation reverses each dimension
+// `i` for which there exists `j` s.t. `axis[j] == i`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions specified
+// in `axis` may be 0 or more entries. If an index is specified more than
+// once, a InvalidArgument error is raised.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [3] or 'dims' is [-1]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is '[1]' (or 'dims' is '[-3]')
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is '[2]' (or 'dims' is '[-2]')
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	tensor: Up to 8-D.
+//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
+// `[-rank(tensor), rank(tensor))`.
 //
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+// Returns The same shape as `tensor`.
+func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "ReverseV2",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			tensor, axis,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["Tout"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
+// Returns the real part of a complex number.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// For example:
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12617,33 +12519,65 @@ func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
+		Type: "Real",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
+
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// Arguments:
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
 //
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
 //
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
 //
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			input_dataset, count,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
@@ -12651,65 +12585,42 @@ func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, outpu
 	return op.Output(0)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// Computes the QR decompositions of one or more matrices.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12718,70 +12629,94 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "Qr",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ConcatV2",
+		Type: "BytesProducedStatsDataset",
 		Input: []tf.Input{
-			tf.OutputList(values), axis,
+			input_dataset, tag,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			filename,
+			var_, alpha, l1, l2, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
 
-// MinKeepDims sets the optional keep_dims attribute to value.
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the minimum of elements across dimensions of a tensor.
+// Computes the mean of elements across dimensions of a tensor.
 //
 // Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
@@ -12794,7 +12729,7 @@ func MinKeepDims(value bool) MinAttr {
 // `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12803,7 +12738,7 @@ func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "Mean",
 		Input: []tf.Input{
 			input, axis,
 		},
@@ -12813,233 +12748,208 @@ func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (ou
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Transpose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
 
-// Computes sigmoid of `x` element-wise.
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sigmoid",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["vocab_size"] = value
 	}
 }
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["delimiter"] = value
 	}
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// Initializes a table from a text file.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			table_handle, filename,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return scope.AddOperation(opspec)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
 
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// Computes Quantized Rectified Linear: `max(features, 0)`
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "QuantizedRelu",
 		Input: []tf.Input{
-			shape,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
+// Reshapes a SparseTensor to represent values in a new dense shape.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "SparseReshape",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input_indices, input_shape, new_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Deprecated. Use TensorArraySplitV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySplitV2",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
+// PackAttr is an optional argument to Pack.
+type PackAttr func(optionalAttr)
 
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// PackAxis sets the optional axis attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+// value: Dimension along which to pack.  Negative values wrap around, so the
+// valid range is `[-(R+1), R+1)`.
+// If not specified, defaults to 0
+func PackAxis(value int64) PackAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["axis"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
+// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// Packs the `N` tensors in `values` into a tensor with rank one higher than each
+// tensor in `values`, by packing them along the `axis` dimension.
+// Given a list of tensors of shape `(A, B, C)`;
+//
+// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
+// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
+// Etc.
+//
+// For example:
+//
+// ```
+// # 'x' is [1, 4]
+// # 'y' is [2, 5]
+// # 'z' is [3, 6]
+// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
+// ```
+//
+// This is the opposite of `unpack`.
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	values: Must be of same shape and type.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns The packed tensor.
+func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13048,9 +12958,9 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "Pack",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
@@ -13058,74 +12968,86 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
+// Reorders a SparseTensor into the canonical, row-major ordering.
+//
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
+//
+// Reordering does not affect the shape of the SparseTensor.
+//
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
 //
 // Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Concat",
+		Type: "SparseReorder",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values),
+			input_indices, input_values, input_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu",
+		Input: []tf.Input{
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
 
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
+// Update '*var' according to the AddSign update.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
+//	m: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
 //	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13134,56 +13056,61 @@ func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
 
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Computes gradient of the FractionalMaxPool function.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -13191,145 +13118,121 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 	return op.Output(0)
 }
 
-// Returns element-wise integer closest to x.
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the proximal adagrad scheme.
 //
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Rint",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			x,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
 
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the (key, value) element with the smallest
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			indices,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
-}
-
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
-
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -13337,62 +13240,76 @@ func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
+//
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
+//
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+//
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
+//
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -13400,102 +13317,55 @@ func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Outpu
 	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor.
-//
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
+// Concatenates tensors along one dimension.
 //
 // Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
-//
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
 //
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
+		Type: "ConcatV2",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			tf.OutputList(values), axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
-	}
-	return outputs
-}
-
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
-
-// SerializeManySparseOutType sets the optional out_type attribute to value.
-//
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	return op.Output(0)
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
-//
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
-//
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			filename,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes sigmoid of `x` element-wise.
+//
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Acosh",
+		Type: "Sigmoid",
 		Input: []tf.Input{
 			x,
 		},
@@ -13504,274 +13374,305 @@ func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["epsilon"] = value
 	}
 }
 
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["data_format"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
 // If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
-
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["is_training"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
+// Batch normalization.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			size,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
 
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
+		m["seed"] = value
 	}
 }
 
-// DecodeCSVNaValue sets the optional na_value attribute to value.
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
 //
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["na_value"] = value
+		m["seed2"] = value
 	}
 }
 
-// Convert CSV records to tensors. Each column maps to one tensor.
+// Outputs random values from a normal distribution.
 //
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Component-wise divides a SparseTensor by a dense Tensor.
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseDiv",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
 	}
-	return output
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["overlapping"] = value
 	}
 }
 
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Computes gradient of the FractionalAvgPool function.
 //
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
+//
+// Arguments:
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "FractionalAvgPoolGrad",
+		Input: []tf.Input{
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op removes all elements in the underlying container.
+// Concatenates tensors along one dimension.
 //
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
-
-		Attrs: attrs,
+		Type: "Concat",
+		Input: []tf.Input{
+			concat_dim, tf.OutputList(values),
+		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["use_locking"] = value
 	}
 }
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// want to use Nesterov momentum.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			true_classes,
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
 
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the data is stored in the order of:
@@ -13779,34 +13680,36 @@ type MaxPoolV2Attr func(optionalAttr)
 // Alternatively, the format could be "NCHW", the data storage order of:
 //     [batch, in_channels, in_height, in_width].
 // If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	input: 4-D input to pool over.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
 //	ksize: The size of the window for each dimension of the input tensor.
 //	strides: The stride of the sliding window for each dimension of the
 // input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			input, ksize, strides,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -13814,146 +13717,75 @@ func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output
 	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
-
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// Returns element-wise integer closest to x.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
-//
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
-	}
-}
-
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
-//
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
-	}
-}
-
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
-//
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["max_load_factor"] = value
-	}
-}
-
-// Creates an empty hash table that uses tensors as the backing store.
-//
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
-//
-// Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
-//	value_dtype: Type of the table values.
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+// ```
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "Rint",
 		Input: []tf.Input{
-			empty_key,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// StageSizeCapacity sets the optional capacity attribute to value.
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// StageSizeContainer sets the optional container attribute to value.
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// StageSizeSharedName sets the optional shared_name attribute to value.
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
+// Op removes and returns the (key, value) element with the smallest
+//
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13962,312 +13794,399 @@ func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (s
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageSize",
-
+		Type: "OrderedMapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// Produces the max pool of the input tensor for quantized types.
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
 //
-// Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softplus",
+		Type: "Acosh",
 		Input: []tf.Input{
-			features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
-//
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Expm1",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
-//
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			reader_handle,
+			size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
+	}
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
+	}
+}
+
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
+	}
+}
+
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// Arguments:
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			data, segment_ids,
+			records, tf.OutputList(record_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that emits the lines of one or more text files.
-//
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return output
 }
 
-// Computes gradients for SparseSegmentMean.
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
+
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
-		Input: []tf.Input{
-			x, y,
-		},
+		Type: "MapClear",
+
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
 
-// Conv3DDataFormat sets the optional data_format attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed"] = value
 	}
 }
 
-// Conv3DDilations sets the optional dilations attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["seed2"] = value
 	}
 }
 
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// Our Conv3D implements a form of cross-correlation.
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			input, filter,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
+
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
-//
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			input, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -14275,156 +14194,118 @@ func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in
 	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["container"] = value
 	}
 }
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
 //
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
 //
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
+
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
 //
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["max_load_factor"] = value
+	}
+}
+
+// Creates an empty hash table that uses tensors as the backing store.
 //
-// Now we can quantize the elements of our tensor:
-// ```c++
-// result = round(input * s)
-// ```
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//	value_dtype: Type of the table values.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			empty_key,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "TruncateMod",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -14433,395 +14314,541 @@ func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+// Inverse 2D real-valued fast Fourier transform.
 //
-// Arguments:
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
 //
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
 
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["channels"] = value
 	}
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// DecodeJpegRatio sets the optional ratio attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["ratio"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-// Arguments:
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
 
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// StageSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageSizeCapacity(value int64) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["capacity"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// REQUIRES: value >= 0
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageSizeContainer(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageSizeSharedName(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
-		Input: []tf.Input{
-			resource, indices,
-		},
+		Type: "StageSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			handle,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
-//
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
-//
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "Softplus",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
-
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Computes exponential of x - 1 element-wise.
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+	opspec := tf.OpSpec{
+		Type: "Expm1",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Returns the number of records this Reader has produced.
+//
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "ReaderNumRecordsProducedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op peeks at the values at the specified key.  If the
+// Computes the sum along segments of a tensor.
 //
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			key, indices,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that emits the lines of one or more text files.
+//
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "TextLineDataset",
+		Input: []tf.Input{
+			filenames, compression_type, buffer_size,
+		},
 	}
-	return values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+// Computes gradients for SparseSegmentMean.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// Returns the set of files matching one or more glob patterns.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+//
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatchingFiles",
+		Input: []tf.Input{
+			pattern,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// Returns the truth value of (x >= y) element-wise.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GreaterEqual",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+		m["data_format"] = value
 	}
 }
 
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+// Conv3DDilations sets the optional dilations attribute to value.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["dilations"] = value
 	}
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-// Accepted values are:
+// Our Conv3D implements a form of cross-correlation.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// Arguments:
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
 //
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseAdd",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Read an element from the TensorArray into output `value`.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	handle: The handle to a TensorArray.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			contents, crop_window,
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -14829,71 +14856,139 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 	return op.Output(0)
 }
 
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["mode"] = value
 	}
 }
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["round_mode"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			true_classes,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
@@ -14901,344 +14996,485 @@ func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, n
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
-//
-// See also `SaveSlices`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+// Returns the truth value of (x < y) element-wise.
 //
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "Less",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
+
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// Arguments:
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FloorMod",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			x, y,
+			features, max_value, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["out_type"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["adjoint_b"] = value
+		m["dilations"] = value
 	}
 }
 
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceGather",
+		Input: []tf.Input{
+			resource, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Delete the TensorArray from its resource container.
 //
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
-// and
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV3",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// then the final deserialized `SparseTensor` will be:
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
+//
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "SparseAdd",
 		Input: []tf.Input{
-			serialized_sparse,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
 
-// StringJoinSeparator sets the optional separator attribute to value.
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
+// REQUIRES: value >= 0
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["capacity"] = value
 	}
 }
 
-// Joins the strings in the given list of string tensors into one tensor;
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// with the given separator (default is an empty separator).
+// REQUIRES: value >= 0
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
 //
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapPeek", err)
+		return
+	}
+	return values
 }
 
-// Returns immutable tensor from memory region.
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
 //
-// The current implementation memmaps the tensor from a file.
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
 //
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
+}
 
-		Attrs: attrs,
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
 //
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
+// Accepted values are:
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "DecodeAndCropJpeg",
 		Input: []tf.Input{
-			input, fft_length,
+			contents, crop_window,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
+
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// then the output will be
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// Graphically this is equivalent to doing
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			true_classes,
 		},
 		Attrs: attrs,
 	}
@@ -15246,358 +15482,415 @@ func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
+// Saves the input tensors to disk.
 //
-// if hashed_output=true then the output will be
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// See also `SaveSlices`.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Save",
+		Input: []tf.Input{
+			filename, tensor_names, tf.OutputList(data),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Concatenates quantized tensors along one dimension.
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
+		Type: "SparseTensorDenseMatMul",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+			a_indices, a_values, a_shape, b,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Slice a `SparseTensor` based on the `start` and `size`.
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// For example, if the input is
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
 //
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// Graphically the output tensors are:
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
 //
-// Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			indices, values, shape, start, size,
+			serialized_sparse,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
+
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins the strings in the given list of string tensors into one tensor;
+//
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// Returns immutable tensor from memory region.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
+// The current implementation memmaps the tensor from a file.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
-//
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
-		Input: []tf.Input{
-			pattern,
-		},
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
-	return func(m optionalAttr) {
-		m["fast"] = value
-	}
-}
-
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
-//
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
+// Inverse real-valued fast Fourier transform.
 //
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
 //
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
+// Equivalent to np.fft.irfft
 // @end_compatibility
-//
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
+//
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
-
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+// Generates sparse cross from a list of sparse and dense tensors.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
 //
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+// For example, if the inputs are
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// if hashed_output=true then the output will be
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
 //
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
 		Attrs: attrs,
 	}
@@ -15605,71 +15898,75 @@ func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_value
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes numerical negative value element-wise.
+// Concatenates quantized tensors along one dimension.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Neg",
+		Type: "QuantizedConcat",
 		Input: []tf.Input{
-			x,
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// For example, if the input is
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+// Graphically the output tensors are:
+//
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			inputs, min, max,
+			indices, values, shape, start, size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // Returns the element-wise min of two SparseTensors.
@@ -16533,30 +16830,6 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 	return op.Output(0)
 }
 
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
-//
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pow",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ShapeAttr is an optional argument to Shape.
 type ShapeAttr func(optionalAttr)
 
@@ -16597,6 +16870,30 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 	return op.Output(0)
 }
 
+// Computes the power of one value to another.
+//
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
+//
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes fingerprints of the input strings.
 //
 // Arguments:
@@ -16856,19 +17153,83 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Stops gradient computation.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
+//
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StopGradient",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Eagerly executes a python function to compute func(input)->output. The
+//
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "EagerPyFunc",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			tf.OutputList(input),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
+	}
+	return output
 }
 
 // Adds sparse updates to the variable referenced by `resource`.
@@ -16951,6 +17312,47 @@ func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (pr
 	return op.Output(0)
 }
 
+// Returns (x - y)(x - y) element-wise.
+//
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SquaredDifference",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Forwards the input to the output.
+//
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
+//
+// Arguments:
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
+//
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LoopCond",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the gradient for the inverse of `x` wrt its input.
 //
 // Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
@@ -17114,211 +17516,14 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
-		Input: []tf.Input{
-			shape, alpha,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
-
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
-	}
-}
-
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_max"] = value
-	}
-}
-
-// Use QuantizeAndDequantizeV2 instead.
-//
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns locations of nonzero / true values in a tensor.
-//
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
-//
-// For example:
-//
-// ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
-//
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-//
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-//
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Where",
-		Input: []tf.Input{
-			condition,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
-
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues a tuple of one or more tensors from the given queue.
-//
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
-//
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
-//
-// Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
-//
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			handle,
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
 // RandomUniformIntAttr is an optional argument to RandomUniformInt.
@@ -17816,6 +18021,197 @@ func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_uppe
 	return op.Output(0)
 }
 
+// Counts the number of occurrences of each value in an integer array.
+//
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
+//
+// Values in `arr` outside of the range [0, size) are ignored.
+//
+// Arguments:
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
+//
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Bincount",
+		Input: []tf.Input{
+			arr, size, weights,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
+//
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumsumReverse sets the optional reverse attribute to value.
+//
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative sum of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Cumsum",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
+
+// CumprodExclusive sets the optional exclusive attribute to value.
+//
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumprodReverse sets the optional reverse attribute to value.
+//
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative product of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Cumprod",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
 type QuantizedMatMulAttr func(optionalAttr)
 
@@ -18823,6 +19219,34 @@ func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Inverse 3D fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT3D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Deprecated. Disallowed in GraphDef version >= 2.
 //
 // DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
@@ -21902,80 +22326,64 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
-// Reshapes a tensor.
-//
-// Given `tensor`, this operation returns a tensor that has the same values
-// as `tensor` with shape `shape`.
-//
-// If one component of `shape` is the special value -1, the size of that dimension
-// is computed so that the total size remains constant.  In particular, a `shape`
-// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// If `shape` is 1-D or higher, then the operation returns a tensor with shape
-// `shape` filled with the values of `tensor`. In this case, the number of elements
-// implied by `shape` must be the same as the number of elements in `tensor`.
+// If the given TensorArray gradient already exists, returns a reference to it.
 //
-// For example:
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
 //
-// ```
-// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-// # tensor 't' has shape [9]
-// reshape(t, [3, 3]) ==> [[1, 2, 3],
-//                         [4, 5, 6],
-//                         [7, 8, 9]]
+// **A note about the input flow_in:**
 //
-// # tensor 't' is [[[1, 1], [2, 2]],
-// #                [[3, 3], [4, 4]]]
-// # tensor 't' has shape [2, 2, 2]
-// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-//                         [3, 3, 4, 4]]
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
 //
-// # tensor 't' is [[[1, 1, 1],
-// #                 [2, 2, 2]],
-// #                [[3, 3, 3],
-// #                 [4, 4, 4]],
-// #                [[5, 5, 5],
-// #                 [6, 6, 6]]]
-// # tensor 't' has shape [3, 2, 3]
-// # pass '[-1]' to flatten 't'
-// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
 //
-// # -1 can also be used to infer the shape
+// **A note about the source attribute:**
 //
-// # -1 is inferred to be 9:
-// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 2:
-// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 3:
-// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-//                               [2, 2, 2],
-//                               [3, 3, 3]],
-//                              [[4, 4, 4],
-//                               [5, 5, 5],
-//                               [6, 6, 6]]]
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
 //
-// # tensor 't' is [7]
-// # shape `[]` reshapes to a scalar
-// reshape(t, []) ==> 7
-// ```
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
 //
-// Arguments:
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
 //
-//	shape: Defines the shape of the output tensor.
-func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "Reshape",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			tensor, shape,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
 // Creates a dataset that splits a SparseTensor into elements row-wise.
@@ -22281,70 +22689,6 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Return substrings from `Tensor` of strings.
 //
 // For each string in the input `Tensor`, creates a substring starting at index
@@ -24260,66 +24604,6 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse
 	return op.Output(0)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
-//
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
-//
-// **A note about the source attribute:**
-//
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
-//
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
-//
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
-//
-// Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
 //
 // Each comparison returns a boolean `true` (if `input_value > threshold`)
@@ -26950,81 +27234,29 @@ func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
 		m["desired_samples"] = value
 	}
 }
-
-// Decode a 16-bit PCM WAV file to a float tensor.
-//
-// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
-//
-// When desired_channels is set, if the input contains fewer channels than this
-// then the last channel will be duplicated to give the requested number, else if
-// the input has more channels than requested then the additional channels will be
-// ignored.
-//
-// If desired_samples is set, then the audio will be cropped or padded with zeroes
-// to the requested length.
-//
-// The first output contains a Tensor with the content of the audio samples. The
-// lowest dimension will be the number of channels, and the second will be the
-// number of samples. For example, a ten-sample-long stereo WAV file should give an
-// output shape of [10, 2].
-//
-// Arguments:
-//	contents: The WAV-encoded audio, usually from a file.
-//
-// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
-func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeWav",
-		Input: []tf.Input{
-			contents,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// UniqueAttr is an optional argument to Unique.
-type UniqueAttr func(optionalAttr)
-
-// UniqueOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueOutIdx(value tf.DataType) UniqueAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
+
+// Decode a 16-bit PCM WAV file to a float tensor.
 //
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
 //
-// For example:
+// When desired_channels is set, if the input contains fewer channels than this
+// then the last channel will be duplicated to give the requested number, else if
+// the input has more channels than requested then the additional channels will be
+// ignored.
 //
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
+// If desired_samples is set, then the audio will be cropped or padded with zeroes
+// to the requested length.
+//
+// The first output contains a Tensor with the content of the audio samples. The
+// lowest dimension will be the number of channels, and the second will be the
+// number of samples. For example, a ten-sample-long stereo WAV file should give an
+// output shape of [10, 2].
 //
 // Arguments:
-//	x: 1-D.
+//	contents: The WAV-encoded audio, usually from a file.
 //
-// Returns 1-D.1-D.
-func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
+// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
+func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27033,9 +27265,9 @@ func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unique",
+		Type: "DecodeWav",
 		Input: []tf.Input{
-			x,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -27663,227 +27895,3 @@ func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Outp
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Eagerly executes a python function to compute func(input)->output. The
-//
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
-	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
-	}
-	return output
-}
-
-// Stops gradient computation.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
-//
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
-//
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StopGradient",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
-
-// PreventGradientMessage sets the optional message attribute to value.
-//
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
-
-// An identity op that triggers an error if a gradient is requested.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
-//
-// Arguments:
-//	input: any tensor.
-//
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "PreventGradient",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Checks a tensor for NaN and Inf values.
-//
-// When run, reports an `InvalidArgument` error if `tensor` has any values
-// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
-//
-// Arguments:
-//
-//	message: Prefix of the error message.
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"message": message}
-	opspec := tf.OpSpec{
-		Type: "CheckNumerics",
-		Input: []tf.Input{
-			tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Shuffle dimensions of x according to a permutation and conjugate the result.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
-func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConjugateTranspose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UniqueV2Attr is an optional argument to UniqueV2.
-type UniqueV2Attr func(optionalAttr)
-
-// UniqueV2OutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
-//
-// Arguments:
-//	x: A `Tensor`.
-//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
-// find the unique elements.
-//
-// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
-// value of x in the output y.
-func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UniqueV2",
-		Input: []tf.Input{
-			x, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 1be4c838f3526bcdf32d8bda5a1ada776a8c1b21..ab7d698a45b7fc0cd498f8367fc1cecf07e4ba3c 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -312,7 +312,7 @@ tf_cc_test(
         "src/gen/cc/source_writer_test.cc",
     ],
     data = [
-        "src/gen/resources/test.snippet.java",
+        "src/gen/resources/test.java.snippet",
     ],
     deps = [
         ":java_op_gen_lib",
diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc
index 214999af9a6f9ee244d336a64830238e6b7ea872..a02f75ad6e7f5f1a9f22ad976e488ae5bf02a731 100644
--- a/tensorflow/java/src/gen/cc/source_writer.cc
+++ b/tensorflow/java/src/gen/cc/source_writer.cc
@@ -23,10 +23,20 @@ namespace tensorflow {
 namespace java {
 
 SourceWriter::SourceWriter() {
-  // push an empty generic namespace at start, for simplification
+  // Push an empty generic namespace at start, for simplification.
   generic_namespaces_.push(new GenericNamespace());
 }
 
+SourceWriter::~SourceWriter() {
+  // Remove empty generic namespace added at start as well as any other
+  // namespace objects that haven't been removed.
+  while (!generic_namespaces_.empty()) {
+    GenericNamespace* generic_namespace = generic_namespaces_.top();
+    generic_namespaces_.pop();
+    delete generic_namespace;
+  }
+}
+
 SourceWriter& SourceWriter::Indent(int tab) {
   left_margin_.resize(
       std::max(static_cast<int>(left_margin_.size() + tab), 0), ' ');
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
index 6abe13b5d217b30d826d013e14a590eeb91719fb..637072c0df1c8dd0c21888f3ec95a259074c3182 100644
--- a/tensorflow/java/src/gen/cc/source_writer.h
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -32,7 +32,8 @@ namespace java {
 class SourceWriter {
  public:
   SourceWriter();
-  virtual ~SourceWriter() = default;
+
+  virtual ~SourceWriter();
 
   // Indents following lines with white spaces.
   //
diff --git a/tensorflow/java/src/gen/cc/source_writer_test.cc b/tensorflow/java/src/gen/cc/source_writer_test.cc
index 6926a5a411d070e25f2382c72589d879d3ca2180..4bce2fea7040a0e5cb9256dc2672399c3af8a03d 100644
--- a/tensorflow/java/src/gen/cc/source_writer_test.cc
+++ b/tensorflow/java/src/gen/cc/source_writer_test.cc
@@ -259,7 +259,9 @@ TEST(StreamTest, Types) {
 
 TEST(StreamTest, FileSnippet) {
   SourceBufferWriter writer;
-  const string& fname = "tensorflow/java/src/gen/resources/test.snippet.java";
+  const string fname = tensorflow::io::JoinPath(
+      tensorflow::testing::TensorFlowSrcRoot(),
+      "java/src/gen/resources/test.java.snippet");
 
   writer.WriteFromFile(fname)
         .BeginBlock()
diff --git a/tensorflow/java/src/gen/resources/test.snippet.java b/tensorflow/java/src/gen/resources/test.java.snippet
similarity index 100%
rename from tensorflow/java/src/gen/resources/test.snippet.java
rename to tensorflow/java/src/gen/resources/test.java.snippet
diff --git a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
index 489e95c3102557d7a75d83789c46106aa5aa3ed4..3948991c84d35009217f7c05844551fdcc49fb22 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
@@ -101,6 +101,7 @@ public class LabelImage {
                   b.constant("mean", mean)),
               b.constant("scale", scale));
       try (Session s = new Session(g)) {
+        // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
         return s.runner().fetch(output.op().name()).run().get(0).expect(Float.class);
       }
     }
@@ -110,6 +111,7 @@ public class LabelImage {
     try (Graph g = new Graph()) {
       g.importGraphDef(graphDef);
       try (Session s = new Session(g);
+          // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
           Tensor<Float> result =
               s.runner().feed("input", image).fetch("output").run().get(0).expect(Float.class)) {
         final long[] rshape = result.shape();
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 1225786812db05a9f45fb040a7c9190da0c4d81b..2890a81479f4e51f7206728bd3294936ffa5a24a 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1617,7 +1617,10 @@ py_library(
 
 py_library(
     name = "array_ops",
-    srcs = ["ops/array_ops.py"],
+    srcs = [
+        "ops/array_ops.py",
+        "ops/inplace_ops.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops_gen",
@@ -2287,7 +2290,6 @@ py_library(
         ":clip_ops",
         ":framework_for_generated_wrappers",
         ":init_ops",
-        ":layers_base",
         ":math_ops",
         ":nn_ops",
         ":partitioned_variables",
@@ -2564,6 +2566,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "summary_ops_v2",
+    srcs = ["ops/summary_ops_v2.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":array_ops",
+        ":constant_op",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":math_ops",
+        ":resource_variable_ops",
+        ":smart_cond",
+        ":summary_op_util",
+        ":summary_ops_gen",
+        ":training_util",
+        ":util",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "template",
     srcs = ["ops/template.py"],
@@ -2926,11 +2952,15 @@ py_library(
     name = "training",
     srcs = glob(
         ["training/**/*.py"],
-        exclude = ["**/*test*"],
+        exclude = [
+            "**/*test*",
+            "training/training_util.py",  # See :training_util
+        ],
     ),
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
+        ":array_ops_gen",
         ":checkpoint_ops_gen",
         ":client",
         ":control_flow_ops",
@@ -2942,6 +2972,7 @@ py_library(
         ":framework_ops",
         ":gradients",
         ":init_ops",
+        ":distribute",
         ":io_ops",
         ":io_ops_gen",
         ":layers_base",
@@ -2960,14 +2991,18 @@ py_library(
         ":string_ops",
         ":summary",
         ":training_ops_gen",
+        ":training_util",
         ":util",
         ":variable_scope",
         ":variables",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
+        # `layers` dependency only exists due to the use of a small utility.
+        "//tensorflow/python/keras:layers",
     ],
 )
 
@@ -2995,6 +3030,68 @@ py_test(
     ],
 )
 
+py_library(
+    name = "device_util",
+    srcs = ["training/device_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":device",
+        ":framework_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "distribute",
+    srcs = ["training/distribute.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":device_util",
+        ":framework_ops",
+        ":platform",
+        ":resource_variable_ops",
+        ":state_ops",
+        ":util",
+        ":variable_scope",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
+py_test(
+    name = "checkpointable_utils_test",
+    srcs = ["training/checkpointable_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",  # b/74395663
+    ],
+    deps = [
+        ":checkpointable",
+        ":constant_op",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":framework_test_lib",
+        ":init_ops",
+        ":resource_variable_ops",
+        ":session",
+        ":state_ops",
+        ":template",
+        ":training",
+        ":training_util",
+        ":variable_scope",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "@six_archive//:six",
+    ],
+)
+
 py_test(
     name = "distribute_test",
     size = "small",
@@ -3002,7 +3099,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":client_testlib",
-        ":training",
+        ":distribute",
         ":variable_scope",
     ],
 )
@@ -3384,6 +3481,7 @@ tf_py_wrap_cc(
         "//tensorflow/c:python_api",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_rpc_factory_registration",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
         "//tensorflow/core/grappler:grappler_item",
@@ -4206,6 +4304,25 @@ py_test(
     ],
 )
 
+py_library(
+    name = "training_util",
+    srcs = ["training/training_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework",
+        ":framework_ops",
+        ":init_ops",
+        ":platform",
+        ":resource_variable_ops",
+        ":state_ops",
+        ":util",
+        ":variable_scope",
+        ":variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_test(
     name = "training_util_test",
     size = "small",
@@ -4216,6 +4333,7 @@ py_test(
         ":framework",
         ":platform",
         ":training",
+        ":training_util",
         ":variables",
     ],
 )
@@ -4245,6 +4363,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":distribute",
         ":framework",
         ":framework_for_generated_wrappers",
         ":platform",
@@ -4260,6 +4379,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":client",
         ":constant_op",
         ":errors",
         ":framework",
@@ -4272,6 +4392,7 @@ py_library(
         ":summary_op_util",
         ":summary_ops",
         ":summary_ops_gen",
+        ":summary_ops_v2",
         ":util",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
@@ -4298,7 +4419,7 @@ py_tests(
         ":platform",
         ":platform_test",
         ":summary",
-        ":training",
+        ":summary_ops_v2",
         "//tensorflow/core:protos_all_py",
     ],
 )
@@ -4323,6 +4444,7 @@ py_library(
         ":variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/keras:engine",
         "//third_party/py/numpy",
     ],
 )
@@ -4359,6 +4481,7 @@ py_library(
         ":variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/keras:layers",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index ab1d01a8351d63544b2c612ad228515d48975aca..13f8420a670fe64615037975139f3ee1f16820b6 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -149,6 +149,19 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.eager.context import executing_eagerly
 from tensorflow.python.framework.ops import enable_eager_execution
 
+# Necessary for the symbols in this module to be taken into account by
+# the namespace management system (API decorators).
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+
+# Required due to `rnn` and `rnn_cell` not being imported in `nn` directly
+# (due to a circular dependency issue: rnn depends on layers).
+nn.dynamic_rnn = rnn.dynamic_rnn
+nn.static_rnn = rnn.static_rnn
+nn.raw_rnn = rnn.raw_rnn
+nn.bidirectional_dynamic_rnn = rnn.bidirectional_dynamic_rnn
+nn.rnn_cell = rnn_cell
+
 # Symbols whitelisted for export without documentation.
 # TODO(cwhipkey): review these and move to contrib, expose through
 # documentation, or remove.
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 4c84d78f2e11922e4819e45aaee79374c8c5ec34..5507d011bb0746c84b868ca7efcc3e4f8d2e146a 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -1454,7 +1454,10 @@ class BaseSession(SessionInterface):
               self._session._session, self._handle, args, status, None)
 
     def __del__(self):
-      if self._handle is not None:
+      # NOTE(mrry): It is possible that `self._session.__del__()` could be
+      # called before this destructor, in which case `self._session._session`
+      # will be `None`.
+      if self._handle is not None and self._session._session is not None:
         with errors.raise_exception_on_not_ok_status() as status:
           if self._session._created_with_new_api:
             tf_session.TF_SessionReleaseCallable(
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 0791c614fa88700fdf2d0d673e168fc9784731a5..1ad0b9de5e76e3edd66303ab4666108f43a27428 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -624,6 +624,20 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testConstantOutput(self):
+    iterator = (
+        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual((i, b"hello", 10), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 class MapDatasetBenchmark(test.Benchmark):
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 8729e085a32f6df87ba9feb515ccfac6a105cfef..406f172e593c1ee89818628b286d31fbd35901b3 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -121,7 +121,7 @@ class Dataset(object):
       An `Iterator` over the elements of this dataset.
 
     Raises:
-      RuntimeError: If eager execution is enabled.
+      RuntimeError: If eager execution is not enabled.
     """
     if context.executing_eagerly():
       return iterator_ops.EagerIterator(self)
@@ -1155,10 +1155,12 @@ class _GeneratorDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
       ret = nest.pack_sequence_as(ret, [
           sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
       ])
 
       self._state_classes = sparse.get_classes(ret)
@@ -1167,11 +1169,9 @@ class _GeneratorDataset(Dataset):
       self._state_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
       return nest.flatten(ret)
 
     self._init_func = tf_init_func
@@ -1214,10 +1214,12 @@ class _GeneratorDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
       ret = nest.pack_sequence_as(ret, [
           sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
       ])
 
       self._output_classes = sparse.get_classes(ret)
@@ -1226,11 +1228,9 @@ class _GeneratorDataset(Dataset):
       self._output_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
       return nest.flatten(ret)
 
     self._next_func = tf_next_func
@@ -1816,10 +1816,12 @@ class MapDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
+      # values to tensors.
       ret = nest.pack_sequence_as(ret, [
           sparse_tensor_lib.SparseTensor.from_value(t)
-          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(t)
+          for t in nest.flatten(ret)
       ])
 
       self._output_classes = sparse.get_classes(ret)
@@ -1828,11 +1830,9 @@ class MapDataset(Dataset):
       self._output_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
-      # Serialize any sparse tensors and convert result to tensors.
-      ret = nest.pack_sequence_as(ret, [
-          ops.convert_to_tensor(t)
-          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
-      ])
+      # Serialize any sparse tensors.
+      ret = nest.pack_sequence_as(
+          ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
       return nest.flatten(ret)
 
     self._map_func = tf_map_func
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 6705cd31e291d2eab7aa8179e9b2b829f8970c18..5e4604fda4d7249a1244f12a533e1cb09e16782f 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -31,15 +31,18 @@ from tensorflow.python.training import session_run_hook
 class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   """Command-line-interface debugger hook.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s. Provides a substitute for
+  `tfdbg.LocalCLIDebugWrapperSession` in cases where the session is not directly
+  available.
   """
 
   def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
-      ui_type: (str) user-interface type.
+      ui_type: (`str`) requested user-interface type. Currently supported:
+        (curses | readline).
       dump_root: (`str`) optional path to the dump root directory. Must be a
         directory that does not exist or an empty directory. If the directory
         does not exist, it will be created by the debugger core during debug
@@ -153,8 +156,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
 class DumpingDebugHook(session_run_hook.SessionRunHook):
   """A debugger hook that dumps debug data to filesystem.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s.
   """
 
   def __init__(self,
@@ -229,8 +232,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
   When the arguments of debug_utils.watch_graph changes, strongly consider
   changing arguments here too so that features are available to tflearn users.
 
-  Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
-  `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+  Can be used as a hook for `tf.train.MonitoredSession`s and
+  `tf.estimator.Estimator`s.
   """
 
   def __init__(self,
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 8c0d3feeceab1bf29c1dabc668176a6ef7806421..b3268c9047e264b8264ae37b404b51be6a88962f 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -142,6 +142,8 @@ cuda_py_test(
         ":tape",
         ":test",
         "//tensorflow/python:clip_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
     ],
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 61859d6be3ae128aaa65fc441d6a1c70e08a044f..5168ad3b18f623588b7804f597fa3b816de147f3 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -223,6 +223,16 @@ class HelperContext(object):
     else:
       return val
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def __enter__(self):
     # pylint: disable=protected-access
     self._g = ops.get_default_graph()
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 9af197981bde309160781fa5821152962e5383bb..65dde75e607b782cd59f631bfac0f1ccc1e6f323 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -29,9 +29,11 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
@@ -104,6 +106,7 @@ class FunctionTest(test.TestCase):
     matmul = function.defun(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
+
     def a_times_b(inputs):
       return matmul(inputs.a['a'], inputs.b['b'])
 
@@ -312,6 +315,7 @@ class FunctionTest(test.TestCase):
         x = variable_scope.get_variable(
             'v', initializer=constant_op.constant(1.0))
         return x * constant_op.constant(2.0)
+
       with self.assertRaisesRegexp(ValueError,
                                    'No trainable variables were accessed'):
         backprop.implicit_val_and_grad(f)()
@@ -581,6 +585,7 @@ class FunctionTest(test.TestCase):
       with ops.name_scope('foo'):
         v = resource_variable_ops.ResourceVariable(0.0, name='bar')
       self.assertEqual(v.name, 'foo/bar:0')
+
     create_variable()
 
   def testVariableNamesRespectNameScopesWithDefunInGraph(self):
@@ -590,9 +595,25 @@ class FunctionTest(test.TestCase):
         with ops.name_scope('foo'):
           v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar')
         self.assertEqual(v.name, 'foo/bar:0')
+
       with ops.get_default_graph().as_default():
         create_variable()
 
+  def testLayerInDefun(self):
+    conv = convolutional.Conv2D(
+        filters=1,
+        kernel_size=2,
+        kernel_initializer=init_ops.ones_initializer(),
+        bias_initializer=init_ops.zeros_initializer())
+
+    @function.defun
+    def model(x):
+      return conv(x)
+
+    x = array_ops.ones([1, 2, 2, 1])
+    y = model(x)
+    self.assertAllEqual([[[[4.0]]]], y.numpy())
+
 
 class AutomaticControlDependenciesTest(test.TestCase):
 
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 5d8b19223f000862aa46ad3a60796ae68bdec2f9..a34405c702f97150748075dcb27bc470579437b0 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -251,6 +251,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:boosted_trees_ops",
         "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lookup_ops",
@@ -327,6 +328,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -383,6 +385,7 @@ py_library(
         ":model_fn",
         ":optimizers",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:layers",
@@ -466,6 +469,7 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:metrics",
         "//tensorflow/python:platform",
@@ -743,6 +747,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 500ea03ea7fef9c60b9f36f1d04f1f4c337371e8..0ecc8c7089a8213887dd2211de120662ab0345a4 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -209,8 +209,8 @@ class _CacheTrainingStatesUsingVariables(object):
         name='cache_insert')
 
 
-class StopAtAttemptsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of trees."""
+class _StopAtAttemptsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at the number of attempts."""
 
   def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor,
                max_trees, max_depth):
@@ -224,25 +224,17 @@ class StopAtAttemptsHook(session_run_hook.SessionRunHook):
         [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
 
   def after_run(self, run_context, run_values):
+    # num_* tensors should be retrieved by a separate session than the training
+    # one, in order to read the values after growing.
+    # So, if it's approaching to the limit, get the actual value by additional
+    # session.
     num_finalized_trees, num_attempted_layers = run_values.results
+    if (num_finalized_trees >= self._max_trees - 1 or
+        num_attempted_layers > 2 * self._max_trees * self._max_depth - 1):
+      num_finalized_trees, num_attempted_layers = run_context.session.run(
+          [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
     if (num_finalized_trees >= self._max_trees or
-        1.0 * num_attempted_layers / self._max_depth > 2 * self._max_trees):
-      run_context.request_stop()
-
-
-class StopAtNumTreesHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of trees."""
-
-  def __init__(self, num_trees_tensor, max_trees):
-    self._num_trees_tensor = num_trees_tensor
-    self._max_trees = max_trees
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(self._num_trees_tensor)
-
-  def after_run(self, run_context, run_values):
-    num_trees = run_values.results
-    if num_trees > self._max_trees:
+        num_attempted_layers > 2 * self._max_trees * self._max_depth):
       run_context.request_stop()
 
 
@@ -325,27 +317,28 @@ def _bt_model_fn(
                                                    head.logits_dimension)
 
     # Create Ensemble resources.
-    if is_single_machine:
-      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-      local_tree_ensemble = tree_ensemble
-      ensemble_reload = control_flow_ops.no_op()
-    else:
-      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-      with ops.device(worker_device):
-        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
-            name=name + '_local', is_local=True)
-      # TODO(soroush): Do partial updates if this becomes a bottleneck.
-      ensemble_reload = local_tree_ensemble.deserialize(
-          *tree_ensemble.serialize())
-
+    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
     # Create logits.
     if mode != model_fn.ModeKeys.TRAIN:
       logits = boosted_trees_ops.predict(
-          tree_ensemble_handle=local_tree_ensemble.resource_handle,
+          # For non-TRAIN mode, ensemble doesn't change after initialization,
+          # so no local copy is needed; using tree_ensemble directly.
+          tree_ensemble_handle=tree_ensemble.resource_handle,
           bucketized_features=input_feature_list,
           logits_dimension=head.logits_dimension,
           max_depth=tree_hparams.max_depth)
     else:
+      if is_single_machine:
+        local_tree_ensemble = tree_ensemble
+        ensemble_reload = control_flow_ops.no_op()
+      else:
+        # Have a local copy of ensemble for the distributed setting.
+        with ops.device(worker_device):
+          local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+              name=name + '_local', is_local=True)
+        # TODO(soroush): Do partial updates if this becomes a bottleneck.
+        ensemble_reload = local_tree_ensemble.deserialize(
+            *tree_ensemble.serialize())
       if cache:
         cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
       else:
@@ -357,8 +350,8 @@ def _bt_model_fn(
             array_ops.zeros(
                 [batch_size, head.logits_dimension], dtype=dtypes.float32))
       with ops.control_dependencies([ensemble_reload]):
-        (stamp_token, num_trees, num_finalized_trees,
-         num_attempted_layers) = local_tree_ensemble.get_states()
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         last_layer_nodes_range) = local_tree_ensemble.get_states()
         summary.scalar('ensemble/num_trees', num_trees)
         summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
         summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
@@ -401,10 +394,7 @@ def _bt_model_fn(
         (node_ids_per_feature, gains_list, thresholds_list,
          left_node_contribs_list, right_node_contribs_list) = (
              boosted_trees_ops.calculate_best_gains_per_feature(
-                 node_id_range=array_ops.stack([
-                     math_ops.reduce_min(node_ids),
-                     math_ops.reduce_max(node_ids)
-                 ]),
+                 node_id_range=last_layer_nodes_range,
                  stats_summary_list=stats_summary_list,
                  l1=tree_hparams.l1,
                  l2=tree_hparams.l2,
@@ -468,7 +458,8 @@ def _bt_model_fn(
     # Add an early stop hook.
     estimator_spec = estimator_spec._replace(
         training_hooks=estimator_spec.training_hooks +
-        (StopAtNumTreesHook(num_trees, tree_hparams.n_trees),))
+        (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
+                             tree_hparams.n_trees, tree_hparams.max_depth),))
   return estimator_spec
 
 
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index 01e5cc7a5d6eb0a26fd47f7d7f9bfb566520b246..7823ef84100578eb0e149364989f6775e94072f0 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -69,7 +69,7 @@ def _make_train_input_fn(is_classification):
   return _input_fn
 
 
-class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     self._feature_columns = {
@@ -79,10 +79,18 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
         for i in range(NUM_FEATURES)
     }
 
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
 
   def testTrainAndEvaluateBinaryClassifier(self):
     input_fn = _make_train_input_fn(is_classification=True)
@@ -97,7 +105,8 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
 
@@ -118,29 +127,9 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase):
     est.train(train_input_fn, steps=num_steps)
 
     predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertEquals(5, len(predictions))
     # All labels are correct.
-    self.assertAllClose([0], predictions[0]['class_ids'])
-    self.assertAllClose([1], predictions[1]['class_ids'])
-    self.assertAllClose([1], predictions[2]['class_ids'])
-    self.assertAllClose([0], predictions[3]['class_ids'])
-    self.assertAllClose([0], predictions[4]['class_ids'])
-
-
-class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, expected_global_step):
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
 
   def testTrainAndEvaluateRegressor(self):
     input_fn = _make_train_input_fn(is_classification=False)
@@ -155,9 +144,10 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 11)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.913176)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
 
   def testInferRegressor(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
@@ -174,16 +164,13 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase):
     num_steps = 100
     # Train for a few steps, and validate final checkpoint.
     est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(est.model_dir, 6)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
 
     predictions = list(est.predict(input_fn=predict_input_fn))
-
-    self.assertEquals(5, len(predictions))
-    self.assertAllClose([0.703549], predictions[0]['predictions'])
-    self.assertAllClose([0.266539], predictions[1]['predictions'])
-    self.assertAllClose([0.256479], predictions[2]['predictions'])
-    self.assertAllClose([1.088732], predictions[3]['predictions'])
-    self.assertAllClose([1.901732], predictions[4]['predictions'])
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
 
 
 class ModelFnTests(test_util.TensorFlowTestCase):
@@ -236,6 +223,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     second_round = """
@@ -320,6 +309,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
         """
     third_round = """
@@ -420,6 +411,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     return (first_round, second_round, third_round)
@@ -457,6 +450,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     second_round = """
@@ -541,6 +536,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
         """
     third_round = """
@@ -641,6 +638,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 3
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
         """
     return (first_round, second_round, third_round)
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index bb033d349534e044b2b92d064051ee5fa07f4d62..5e61c30ea2d4019dd89d9e1c4418ca1bb6a2c9ae 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -57,8 +57,8 @@ _PREDICT_SERVING_KEY = 'predict'
 
 # A LossSpec contains
 # * a scalar `Tensor` representing reduced weighted training loss
-# * a scalar `Tensor` representing the unreduced unweighted loss
-# * a scalar `Tensor` representing the example weights
+# * a `Tensor` representing the unreduced unweighted loss
+# * a `Tensor` representing the example weights
 # * possibly processed labels (e.g. vocabulary lookup, shape manipulation, etc)
 LossSpec = collections.namedtuple(
     'LossSpec', ['training_loss', 'unreduced_loss', 'weights',
@@ -163,8 +163,8 @@ class _Head(object):
     Returns:
       A LossSpec that contains
       * the scalar `Tensor` representing reduced weighted training loss
-      * the scalar `Tensor` representing the unreduced unweighted loss
-      * the scalar `Tensor` representing the example weights
+      * the `Tensor` representing the unreduced unweighted loss
+      * the `Tensor` representing the example weights
       * possibly processed labels (e.g. vocabulary lookup, shape manipulation,
         etc.)
 
@@ -263,9 +263,12 @@ def _check_dense_labels_match_logits_and_reshape(
         if (dim1 is not None) and (dim1 != expected_labels_dimension):
           raise ValueError(
               'Mismatched label shape. '
-              'Classifier configured with n_classes=%s.  Received %s. '
-              'Suggested Fix: check your n_classes argument to the estimator '
-              'and/or the shape of your label.' %
+              'Expected labels dimension=%s.  Received %s. '
+              'Suggested Fix:'
+              'If your classifier expects one-hot encoding label,'
+              'check your n_classes argument to the estimator'
+              'and/or the shape of your label.'
+              'Otherwise, check the shape of your label.' %
               (expected_labels_dimension, dim1))
       expected_labels_shape = array_ops.concat(
           [logits_shape[:-1], [expected_labels_dimension]], axis=0)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 4d3eff71ad2167315614c41b70f1127d51b12de3..901f04719f7fc76a68692311d1d7d4e4128bb23c 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -216,7 +216,8 @@ class Estimator(object):
     else:
       self._session_config = self._config.session_config
 
-    self._device_fn = _get_replica_device_setter(self._config)
+    self._device_fn = self._config.device_fn or \
+                      _get_replica_device_setter(self._config)
 
     if model_fn is None:
       raise ValueError('model_fn must be provided to Estimator.')
@@ -637,7 +638,7 @@ class Estimator(object):
         # pylint: disable=protected-access
         local_init_op = (
             estimator_spec.scaffold.local_init_op or
-            monitored_session.Scaffold._default_local_init_op())
+            monitored_session.Scaffold.default_local_init_op())
         # pylint: enable=protected-access
 
         # Perform the export
@@ -723,7 +724,7 @@ class Estimator(object):
       batch_length = batch_length or value.shape[0]
       if value.shape[0] != batch_length:
         raise ValueError('Batch length of predictions should be same. %s has '
-                         'different batch length then others.' % key)
+                         'different batch length than others.' % key)
     return batch_length
 
   def _extract_keys(self, predictions, predict_keys):
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index f62c9cece6a4d370532ca3077d679a54f38918f1..8162b249f1f0be6c901f09ff21f9a67f7c5e492f 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -27,11 +27,13 @@ import six
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.estimator import util
 from tensorflow.python.util import compat_internal
 from tensorflow.python.util.tf_export import tf_export
 
 
 _USE_DEFAULT = object()
+_VALID_DEVICE_FN_ARGS = set(['op'])
 
 # A list of the property names in RunConfig that the user is allowed to change.
 _DEFAULT_REPLACEABLE_LIST = [
@@ -44,7 +46,8 @@ _DEFAULT_REPLACEABLE_LIST = [
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
     'log_step_count_steps',
-    'train_distribute'
+    'train_distribute',
+    'device_fn'
 ]
 
 _SAVE_CKPT_ERR = (
@@ -279,6 +282,11 @@ def _validate_properties(run_config):
   _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types),
             message='tf_random_seed must be integer.')
 
+  _validate('device_fn', lambda device_fn: six.callable(device_fn) and
+            set(util.fn_args(device_fn)) == _VALID_DEVICE_FN_ARGS,
+            message='device_fn must be callable with exactly'
+                    ' one argument "op".')
+
 
 class TaskType(object):
   MASTER = 'master'
@@ -302,7 +310,8 @@ class RunConfig(object):
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
                log_step_count_steps=100,
-               train_distribute=None):
+               train_distribute=None,
+               device_fn=None):
     """Constructs a RunConfig.
 
     All distributed training related properties `cluster_spec`, `is_chief`,
@@ -316,7 +325,7 @@ class RunConfig(object):
     a list of task addresses.
 
     `task` has two attributes: `type` and `index`, where `type` can be any of
-    the task types in `cluster`. ` When `TF_CONFIG` contains said information,
+    the task types in `cluster`. When `TF_CONFIG` contains said information,
     the following properties are set on this class:
 
     * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
@@ -430,6 +439,10 @@ class RunConfig(object):
         `tf.contrib.distribute.DistributionStrategy`. If specified,
         then Estimator will distribute the user's model during training,
         according to the policy specified by that strategy.
+      device_fn: A callable invoked for every `Operation` that takes the
+        `Operation` and returns the device string. If `None`, defaults to
+        the device function returned by `tf.train.replica_device_setter`
+        with round-robin strategy.
 
     Raises:
       ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
@@ -466,7 +479,8 @@ class RunConfig(object):
         keep_checkpoint_max=keep_checkpoint_max,
         keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
         log_step_count_steps=log_step_count_steps,
-        train_distribute=train_distribute)
+        train_distribute=train_distribute,
+        device_fn=device_fn)
 
     self._init_distributed_setting_from_environment_var(tf_config)
 
@@ -568,6 +582,16 @@ class RunConfig(object):
   def cluster_spec(self):
     return self._cluster_spec
 
+  @property
+  def device_fn(self):
+    """Returns the device_fn.
+
+    If device_fn is not `None`, it overrides the default
+    device function used in `Estimator`.
+    Otherwise the default one is used.
+    """
+    return self._device_fn
+
   @property
   def evaluation_master(self):
     return self._evaluation_master
@@ -697,7 +721,8 @@ class RunConfig(object):
       - `keep_checkpoint_max`,
       - `keep_checkpoint_every_n_hours`,
       - `log_step_count_steps`,
-      - `train_distribute`.
+      - `train_distribute`,
+      - `device_fn`.
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
index a3eef4c53fd90a1ce69f3067d0b5c15909f43cec..c8b12605e1aaad11e114e4ace63697b93f3b2b92 100644
--- a/tensorflow/python/estimator/run_config_test.py
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -42,6 +42,7 @@ _SESSION_CONFIG_ERR = 'session_config must be instance of ConfigProto'
 _KEEP_CKPT_MAX_ERR = 'keep_checkpoint_max should be >= 0'
 _KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0'
 _TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer'
+_DEVICE_FN_ERR = 'device_fn must be callable with exactly one argument "op".'
 _ONE_CHIEF_ERR = 'The "cluster" in TF_CONFIG must have only one "chief" node.'
 _ONE_MASTER_ERR = 'The "cluster" in TF_CONFIG must have only one "master" node.'
 _INVALID_TASK_TYPE_FOR_EVAL_MASTER = (
@@ -83,6 +84,7 @@ class RunConfigTest(test.TestCase):
     self.assertEqual(5, config.keep_checkpoint_max)
     self.assertEqual(10000, config.keep_checkpoint_every_n_hours)
     self.assertIsNone(config.service)
+    self.assertIsNone(config.device_fn)
 
   def test_model_dir(self):
     empty_config = run_config_lib.RunConfig()
@@ -93,6 +95,7 @@ class RunConfigTest(test.TestCase):
 
   def test_replace_with_allowed_properties(self):
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    device_fn = lambda op: "/cpu:0"
 
     config = run_config_lib.RunConfig().replace(
         tf_random_seed=11,
@@ -100,13 +103,15 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_secs=14,
         session_config=session_config,
         keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17)
+        keep_checkpoint_every_n_hours=17,
+        device_fn=device_fn)
     self.assertEqual(11, config.tf_random_seed)
     self.assertEqual(12, config.save_summary_steps)
     self.assertEqual(14, config.save_checkpoints_secs)
     self.assertEqual(session_config, config.session_config)
     self.assertEqual(16, config.keep_checkpoint_max)
     self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+    self.assertEqual(device_fn, config.device_fn)
 
   def test_replace_none_value(self):
     config = run_config_lib.RunConfig().replace(
@@ -117,7 +122,8 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_steps=None,
         session_config=None,
         keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None)
+        keep_checkpoint_every_n_hours=None,
+        device_fn=None)
     self.assertIsNone(config.tf_random_seed)
     self.assertIsNone(config.model_dir)
     self.assertIsNone(config.save_summary_steps)
@@ -126,6 +132,7 @@ class RunConfigTest(test.TestCase):
     self.assertIsNone(config.session_config)
     self.assertIsNone(config.keep_checkpoint_max)
     self.assertIsNone(config.keep_checkpoint_every_n_hours)
+    self.assertIsNone(config.device_fn)
 
   def test_replace_with_disallowallowed_properties(self):
     config = run_config_lib.RunConfig()
@@ -166,9 +173,12 @@ class RunConfigTest(test.TestCase):
       config.replace(keep_checkpoint_every_n_hours=0)
     with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
       config.replace(tf_random_seed=1.0)
+    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
+      config.replace(device_fn=lambda x, y: 0)
 
   def test_init_with_allowed_properties(self):
     session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    device_fn = lambda op: "/cpu:0"
 
     config = run_config_lib.RunConfig(
         tf_random_seed=11,
@@ -176,13 +186,15 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_secs=14,
         session_config=session_config,
         keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17)
+        keep_checkpoint_every_n_hours=17,
+        device_fn=device_fn)
     self.assertEqual(11, config.tf_random_seed)
     self.assertEqual(12, config.save_summary_steps)
     self.assertEqual(14, config.save_checkpoints_secs)
     self.assertEqual(session_config, config.session_config)
     self.assertEqual(16, config.keep_checkpoint_max)
     self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+    self.assertEqual(device_fn, config.device_fn)
 
   def test_init_none_value(self):
     config = run_config_lib.RunConfig(
@@ -193,7 +205,8 @@ class RunConfigTest(test.TestCase):
         save_checkpoints_steps=None,
         session_config=None,
         keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None)
+        keep_checkpoint_every_n_hours=None,
+        device_fn=None)
     self.assertIsNone(config.tf_random_seed)
     self.assertIsNone(config.model_dir)
     self.assertIsNone(config.save_summary_steps)
@@ -202,6 +215,7 @@ class RunConfigTest(test.TestCase):
     self.assertIsNone(config.session_config)
     self.assertIsNone(config.keep_checkpoint_max)
     self.assertIsNone(config.keep_checkpoint_every_n_hours)
+    self.assertIsNone(config.device_fn)
 
   def test_init_invalid_values(self):
     with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
@@ -220,6 +234,8 @@ class RunConfigTest(test.TestCase):
       run_config_lib.RunConfig(keep_checkpoint_every_n_hours=0)
     with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
       run_config_lib.RunConfig(tf_random_seed=1.0)
+    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
+      run_config_lib.RunConfig(device_fn=lambda x: "/cpu:0")
 
 
 class RunConfigDistributedSettingTest(test.TestCase):
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index e38b765da52a7b6957a4fb8a02087c5d1fd5a781..9d271758f635869730c03bc2ac853b3493ec0cae 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -137,7 +137,7 @@ class TrainSpec(
           * A tuple (features, labels): Where features is a `Tensor` or a
             dictionary of string feature name to `Tensor` and labels is a
             `Tensor` or a dictionary of string label name to `Tensor`.
-            
+
       max_steps: Int. Positive number of total steps for which to train model.
         If `None`, train forever. The training `input_fn` is not expected to
         generate `OutOfRangeError` or `StopIteration` exceptions. See the
@@ -334,7 +334,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   can read and write). The only extra work to do is setting the environment
   variable `TF_CONFIG` properly for each worker correspondingly.
 
-  Also see: https://www.tensorflow.org/deploy/distributed
+  Also see
+  [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed).
 
   Setting environment variable depends on the platform. For example, on Linux,
   it can be done as follows (`$` is the shell prompt):
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 3a315e5c2ea0d9607b5aa52715364d6bdf152e1c..f9201a4794f78ec94e3901b14c25aca61f932d86 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -581,24 +581,25 @@ class _LinearModel(training.Model):
         **kwargs)
 
   def call(self, features):
-    for column in self._feature_columns:
-      if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
-        raise ValueError(
-            'Items of feature_columns must be either a '
-            '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
-    weighted_sums = []
-    ordered_columns = []
-    builder = _LazyBuilder(features)
-    for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
-      ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
-      weighted_sum = layer(builder)
-      weighted_sums.append(weighted_sum)
+    with variable_scope.variable_scope(self.name):
+      for column in self._feature_columns:
+        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
+          raise ValueError(
+              'Items of feature_columns must be either a '
+              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
+      weighted_sums = []
+      ordered_columns = []
+      builder = _LazyBuilder(features)
+      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
+        ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
+        weighted_sum = layer(builder)
+        weighted_sums.append(weighted_sum)
 
-    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
-    predictions_no_bias = math_ops.add_n(
-        weighted_sums, name='weighted_sum_no_bias')
-    predictions = nn_ops.bias_add(
-        predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
+      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
+      predictions_no_bias = math_ops.add_n(
+          weighted_sums, name='weighted_sum_no_bias')
+      predictions = nn_ops.bias_add(
+          predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
     return predictions
 
   def _add_layers(self, layers):
@@ -3147,6 +3148,9 @@ def _safe_embedding_lookup_sparse(embedding_weights,
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+    if combiner != 'sum':
+      sparse_ids, sparse_weights = _prune_invalid_weights(
+          sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
     sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
@@ -3195,13 +3199,23 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
   if sparse_weights is not None:
     is_id_valid = math_ops.logical_and(
-        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+        is_id_valid,
+        array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
   sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
   if sparse_weights is not None:
     sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
   return sparse_ids, sparse_weights
 
 
+def _prune_invalid_weights(sparse_ids, sparse_weights):
+  """Prune invalid weights (< 0) from the input ids and weights."""
+  if sparse_weights is not None:
+    is_weights_valid = math_ops.greater(sparse_weights.values, 0)
+    sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
+  return sparse_ids, sparse_weights
+
+
 class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                        collections.namedtuple('_IndicatorColumn',
                                               ['categorical_column'])):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 07588af37ee92eb2143d20eafa2874d794360fa4..62718db0e5a71e5be8361cd297eb61a78b07a06f 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -1511,6 +1511,28 @@ class LinearModelTest(test.TestCase):
         sess.run(bias.assign([5.]))
         self.assertAllClose([[1005.], [5010.]], predictions.eval())
 
+  def test_sparse_combiner_with_negative_weights(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {
+          'wire_cast': wire_tensor,
+          'weights': constant_op.constant([[1., 1., -1.0]])
+      }
+      predictions = fc.linear_model(
+          features, [wire_cast_weights], sparse_combiner='sum')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
@@ -6164,14 +6186,16 @@ class WeightedCategoricalColumnTest(test.TestCase):
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = get_keras_linear_model_predictions({
-          'ids':
-              sparse_tensor.SparseTensorValue(
-                  indices=((0, 0), (1, 0), (1, 1)),
-                  values=(0, 2, 1),
-                  dense_shape=(2, 2)),
-          'values': ((.5,), (1.,))
-      }, (column,))
+      predictions = get_keras_linear_model_predictions(
+          {
+              'ids':
+                  sparse_tensor.SparseTensorValue(
+                      indices=((0, 0), (1, 0), (1, 1)),
+                      values=(0, 2, 1),
+                      dense_shape=(2, 2)),
+              'values': ((.5,), (1.,))
+          }, (column,),
+          sparse_combiner='mean')
       with _initialized_session():
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
           predictions.eval()
@@ -6255,13 +6279,16 @@ class WeightedCategoricalColumnTest(test.TestCase):
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          'ids': sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2)),
-          'values': ((.5,), (1.,))
-      }, (column,))
+      predictions = fc.linear_model(
+          {
+              'ids':
+                  sparse_tensor.SparseTensorValue(
+                      indices=((0, 0), (1, 0), (1, 1)),
+                      values=(0, 2, 1),
+                      dense_shape=(2, 2)),
+              'values': ((.5,), (1.,))
+          }, (column,),
+          sparse_combiner='mean')
       with _initialized_session():
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
           predictions.eval()
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 0edae92fd4a86e7d10a180ce64364d3ea552bf60..eda713641dc2bf62432edf0e787f61079fba7fdc 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -297,6 +297,9 @@ class DType(object):
   def __hash__(self):
     return self._type_enum
 
+  def __reduce__(self):
+    return as_dtype, (self.name,)
+
   @property
   def size(self):
     if (self._type_enum == types_pb2.DT_VARIANT or
@@ -345,7 +348,7 @@ tf_export("uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
 tf_export("uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
-tf_export("uint64").export_constant(__name__, "uint32")
+tf_export("uint64").export_constant(__name__, "uint64")
 int16 = DType(types_pb2.DT_INT16)
 tf_export("int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
@@ -648,6 +651,10 @@ QUANTIZED_DTYPES = frozenset([
 ])
 tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
+_PYTHON_TO_TF = {
+    float: float32,
+    bool: bool,
+}
 
 @tf_export("as_dtype")
 def as_dtype(type_value):
@@ -679,6 +686,11 @@ def as_dtype(type_value):
   except KeyError:
     pass
 
+  try:
+    return _PYTHON_TO_TF[type_value]
+  except KeyError:
+    pass
+
   if isinstance(type_value, np.dtype):
     # The numpy dtype for strings is variable length. We can not compare
     # dtype with a single constant (np.string does not exist) to decide
@@ -687,11 +699,13 @@ def as_dtype(type_value):
     if type_value.type == np.string_ or type_value.type == np.unicode_:
       return string
 
-  for key, val in _NP_TO_TF:
-    try:
-      if key == type_value:
-        return val
-    except TypeError as e:
-      raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e))
+  if isinstance(type_value, (type, np.dtype)):
+    for key, val in _NP_TO_TF:
+      try:
+        if key == type_value:
+          return val
+      except TypeError as e:
+        raise TypeError("Cannot convert {} to a dtype. {}".format(
+            type_value, e))
 
   raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index e49e2fda5d84da4f8f87fae73874351afe0a20f2..7c2169b2af10d46ce4a14688aa5ea70fe985a24d 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -295,6 +295,19 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertNotEqual(dtypes.int32, int)
     self.assertNotEqual(dtypes.float64, 2.1)
 
+  def testPythonTypesConversion(self):
+    self.assertIs(dtypes.float32, dtypes.as_dtype(float))
+    self.assertIs(dtypes.bool, dtypes.as_dtype(bool))
+
+  def testReduce(self):
+    for enum in dtypes._TYPE_TO_STRING:
+      dtype = dtypes.DType(enum)
+      ctor, args = dtype.__reduce__()
+      self.assertEquals(ctor, dtypes.as_dtype)
+      self.assertEquals(args, (dtype.name,))
+      reconstructed = ctor(*args)
+      self.assertEquals(reconstructed, dtype)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index c05396b06e7824f145f4a9357456ebec4898fca0..d6bc14fbc75199a97f50c4dc120b2704970d1879 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -37,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_logging_ops
@@ -1362,7 +1361,7 @@ class UnrollLSTMTest(test.TestCase):
         value=math_ops.matmul(xm, weights), num_or_size_splits=4, axis=1)
     new_c = math_ops.sigmoid(f_g) * cprev + math_ops.sigmoid(
         i_g) * math_ops.tanh(i_i)
-    new_c = clip_ops.clip_by_value(new_c, -50.0, 50.0)
+    new_c = math_ops.maximum(math_ops.minimum(new_c, 50.0), -50.0)
     new_m = math_ops.sigmoid(o_g) * math_ops.tanh(new_c)
     return new_m, new_c
 
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 910364364c8be84b1a629dbdaae5e69443d07e75..394fac6c856197030f85aab5b11fa881eddf670d 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -285,7 +285,7 @@ def convert_variables_to_constants(sess,
     output_graph_def.node.extend([output_node])
 
   output_graph_def.library.CopyFrom(inference_graph.library)
-  print("Converted %d variables to const ops." % how_many_converted)
+  logging.info("Converted %d variables to const ops.", how_many_converted)
   return output_graph_def
 
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 2574fa57a43f135f7d5ed17bb73d2b28a7c087a8..662cda2a7d4822d92a7d10ac42012bc2675c5eac 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections
 import copy
+import functools
 import linecache
 import os
 import re
@@ -4179,6 +4180,19 @@ class Graph(object):
     """
     return self._name_stack
 
+  @tf_contextlib.contextmanager
+  def _colocate_with_for_gradient(self, op, gradient_uid,
+                                  ignore_existing=False):
+    with self.colocate_with(op, ignore_existing):
+      if gradient_uid is not None and self._control_flow_context is not None:
+        try:
+          self._control_flow_context.EnterGradientColocation(op, gradient_uid)
+          yield
+        finally:
+          self._control_flow_context.ExitGradientColocation(op, gradient_uid)
+      else:
+        yield
+
   @tf_contextlib.contextmanager
   def colocate_with(self, op, ignore_existing=False):
     """Returns a context manager that specifies an op to colocate with.
@@ -4958,8 +4972,7 @@ def container(container_name):
   return get_default_graph().container(container_name)
 
 
-@tf_export("colocate_with")
-def colocate_with(op, ignore_existing=False):
+def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
   if context.executing_eagerly():
     if op is not None:
       return device(op.device)
@@ -4973,7 +4986,13 @@ def colocate_with(op, ignore_existing=False):
       else:
         raise ValueError("Encountered an Eager-defined Tensor during graph "
                          "construction, but a function was not being built.")
-    return default_graph.colocate_with(op, ignore_existing)
+    return default_graph._colocate_with_for_gradient(
+        op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
+
+
+@tf_export("colocate_with")
+def colocate_with(op, ignore_existing=False):
+  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
 
 
 @tf_export("control_dependencies")
@@ -5226,14 +5245,35 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
   @tf_contextlib.contextmanager
   def get_controller(self, default):
     try:
-      context.context().context_switches.push(default.building_function,
-                                              default.as_default)
+      if context.executing_eagerly():
+        # A Graph alone on the context stack would keep init_scope-wrapped
+        # operations graph building when entered (assuming init_scope is called
+        # in a graph building context). Instead, we push a context which first
+        # enables eager execution and then re-enters the Graph.
+        context.context().context_switches.push(
+            default.building_function,
+            functools.partial(
+                _enter_context_and_graph,
+                context.eager_mode,
+                default.as_default))
+      else:
+        # This Graph is being used from a graph building context. A lack of
+        # context switch implies that the context is graph building.
+        context.context().context_switches.push(default.building_function,
+                                                default.as_default)
       with super(_DefaultGraphStack, self).get_controller(default) as g:
         yield g
     finally:
       context.context().context_switches.pop()
 
 
+@tf_contextlib.contextmanager
+def _enter_context_and_graph(context_fn, graph_fn):
+  """Combines two context managers."""
+  with context_fn(), graph_fn():
+    yield
+
+
 _default_graph_stack = _DefaultGraphStack()
 
 
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 58bead91ed8dedaa2fa90e9edfcf377e943ef79f..c9c1a3d66be1051859b3dc4eef67803881efcd55 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -2305,6 +2305,13 @@ class InitScopeTest(test_util.TensorFlowTestCase):
           self.assertEqual(ops.get_name_scope(), "inner")
       self.assertEqual(ops.get_name_scope(), "")
 
+  def testEagerGraphContextsExecuteEagerly(self):
+    with context.eager_mode():
+      with ops.Graph().as_default():
+        with context.graph_mode():
+          with ops.init_scope():
+            self.assertTrue(context.executing_eagerly())
+
   def testPreservesNameScopeInEagerExecution(self):
     with context.eager_mode():
       def foo():
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 26069d9d90e4a75cfe3988628f1407d6f327385b..0dd29460ed93aadf61ef1f1b2dbf1d7802ca4877 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -459,6 +459,9 @@ class Dimension(object):
     else:
       return self._value >= other.value
 
+  def __reduce__(self):
+    return Dimension, (self._value,)
+
 
 def as_dimension(value):
   """Converts the given value to a Dimension.
@@ -931,6 +934,9 @@ class TensorShape(object):
       return True
     return self._dims != other.dims
 
+  def __reduce__(self):
+    return TensorShape, (self._dims,)
+
 
 def as_shape(shape):
   """Converts the given object to a TensorShape."""
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 4f239228332946d9a863be408f5967c282019852..a00e82d470d045b8548e625ef64825247b617968 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -197,6 +197,14 @@ class DimensionTest(test_util.TensorFlowTestCase):
     self.assertEqual(nine % 4, 1)
     self.assertEqual(4 % nine, 4)
 
+  def testReduce(self):
+    dim = tensor_shape.Dimension(5)
+    ctor, args = dim.__reduce__()
+    self.assertEquals(ctor, tensor_shape.Dimension)
+    self.assertEquals(args, (5,))
+    reconstructed = ctor(*args)
+    self.assertEquals(reconstructed, dim)
+
 
 class ShapeTest(test_util.TensorFlowTestCase):
 
@@ -422,5 +430,15 @@ class ShapeTest(test_util.TensorFlowTestCase):
     self.assertAllEqual([2, None, 4], tensor_shape.TensorShape(
         (2, None, 4)).as_list())
 
+  def testReduce(self):
+    shape = tensor_shape.TensorShape([2, 3])
+    ctor, args = shape.__reduce__()
+    self.assertEquals(ctor, tensor_shape.TensorShape)
+    self.assertEquals(args, ([tensor_shape.Dimension(2),
+                              tensor_shape.Dimension(3)],))
+    reconstructed = ctor(*args)
+    self.assertEquals(reconstructed, shape)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 64b0fa6c00686949066d78c4f54f23205ede8525..8cf24206edab8be807eca1d067662a57585e2bda 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -822,17 +822,32 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   all-or-nothing.
 
   Args:
-    tensor: The rank-1 Tensor to be evaluated.
+    tensor: The rank-0 or rank-1 Tensor to be evaluated.
 
   Returns:
     A `TensorShape` based on the constant value of the given `tensor`.
+
+  Raises:
+    ValueError: If the shape is rank-0 and is not statically known to be -1.
   """
   if isinstance(tensor, ops.EagerTensor):
     return tensor_shape.as_shape(
         [dim if dim != -1 else None for dim in tensor.numpy()])
 
+  if tensor.get_shape().ndims == 0:
+    value = constant_value(tensor)
+    if value is None:
+      raise ValueError(
+          "Received a scalar with unknown value as shape; require a statically "
+          "known scalar with value '-1' to describe an unknown shape.")
+    if value != -1:
+      raise ValueError(
+          "Received a scalar value '%s' as shape; require a statically known "
+          "scalar with value '-1' to describe an unknown shape." % value)
+    return tensor_shape.unknown_shape()
+
   shape = tensor.get_shape().with_rank(1)
-  if tensor.get_shape() == [0]:
+  if shape == [0]:
     return tensor_shape.scalar()
   elif tensor.op.type == "Shape":
     return tensor.op.inputs[0].get_shape()
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index bf00fa6439b82234e951598131d2d7ab579fb6c4..77dd76cba39af8fb61ac71837dd887bb90d3233d 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -615,45 +615,68 @@ def assert_no_garbage_created(f):
 
 
 def run_in_graph_and_eager_modes(__unused__=None,
-                                 graph=None,
                                  config=None,
-                                 use_gpu=False,
-                                 force_gpu=False,
+                                 use_gpu=True,
                                  reset_test=True,
                                  assert_no_eager_garbage=False):
-  """Runs the test in both graph and eager modes.
+  """Execute the decorated test with and without enabling eager execution.
+
+  This function returns a decorator intended to be applied to test methods in
+  a @{tf.test.TestCase} class. Doing so will cause the contents of the test
+  method to be executed twice - once normally, and once with eager execution
+  enabled. This allows unittests to confirm the equivalence between eager
+  and graph execution (see @{tf.enable_eager_execution}).
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(tf.test.TestCase):
+
+    @run_in_graph_and_eager_modes()
+    def test_foo(self):
+      x = tf.constant([1, 2])
+      y = tf.constant([3, 4])
+      z = tf.add(x, y)
+      self.assertAllEqual([4, 6], self.evaluate(z))
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test validates that `tf.add()` has the same behavior when computed with
+  eager execution enabled as it does when constructing a TensorFlow graph and
+  executing the `z` tensor in a session.
+
 
   Args:
     __unused__: Prevents sliently skipping tests.
-    graph: Optional graph to use during the returned session.
     config: An optional config_pb2.ConfigProto to use to configure the
-      session.
-    use_gpu: If True, attempt to run as many ops as possible on GPU.
-    force_gpu: If True, pin all ops to `/device:GPU:0`.
-    reset_test: If True, tearDown and SetUp the test case again.
+      session when executing graphs.
+    use_gpu: If True, attempt to run as many operations as possible on GPU.
+    reset_test: If True, tearDown and SetUp the test case between the two
+      executions of the test (once with and once without eager execution).
     assert_no_eager_garbage: If True, sets DEBUG_SAVEALL on the garbage
       collector and asserts that no extra garbage has been created when running
-      the test in eager mode. This will fail if there are reference cycles
-      (e.g. a = []; a.append(a)). Off by default because some tests may create
-      garbage for legitimate reasons (e.g. they define a class which inherits
-      from `object`), and because DEBUG_SAVEALL is sticky in some Python
-      interpreters (meaning that tests which rely on objects being collected
-      elsewhere in the unit test file will not work). Additionally, checks that
-      nothing still has a reference to Tensors that the test allocated.
+      the test with eager execution enabled. This will fail if there are
+      reference cycles (e.g. a = []; a.append(a)). Off by default because some
+      tests may create garbage for legitimate reasons (e.g. they define a class
+      which inherits from `object`), and because DEBUG_SAVEALL is sticky in some
+      Python interpreters (meaning that tests which rely on objects being
+      collected elsewhere in the unit test file will not work). Additionally,
+      checks that nothing still has a reference to Tensors that the test
+      allocated.
   Returns:
-    Returns a decorator that will run the decorated test function
-        using both a graph and using eager execution.
+    Returns a decorator that will run the decorated test method twice:
+    once by constructing and executing a graph in a session and once with
+    eager execution enabled.
   """
 
   assert not __unused__, "Add () after run_in_graph_and_eager_modes."
 
   def decorator(f):
-    """Test method decorator."""
-
     def decorated(self, **kwargs):
-      """Decorated the test method."""
       with context.graph_mode():
-        with self.test_session(graph, config, use_gpu, force_gpu):
+        with self.test_session(use_gpu=use_gpu):
           f(self, **kwargs)
 
       if reset_test:
@@ -663,27 +686,20 @@ def run_in_graph_and_eager_modes(__unused__=None,
         self._tempdir = None
         self.setUp()
 
-      def run_eager_mode(self, **kwargs):
-        if force_gpu:
-          gpu_name = gpu_device_name()
-          if not gpu_name:
-            gpu_name = "/device:GPU:0"
-          with context.device(gpu_name):
-            f(self)
-        elif use_gpu:
-          # TODO(xpan): Support softplacement and gpu by default when available.
-          f(self, **kwargs)
-        else:
-          with context.device("/device:CPU:0"):
+      def run_eagerly(self, **kwargs):
+        if not use_gpu:
+          with ops.device("/cpu:0"):
             f(self, **kwargs)
+        else:
+          f(self, **kwargs)
 
       if assert_no_eager_garbage:
-        run_eager_mode = assert_no_new_tensors(
-            assert_no_garbage_created(run_eager_mode))
+        run_eagerly = assert_no_new_tensors(
+            assert_no_garbage_created(run_eagerly))
 
       with context.eager_mode():
         with ops.Graph().as_default():
-          run_eager_mode(self, **kwargs)
+          run_eagerly(self, **kwargs)
 
     return decorated
 
@@ -974,6 +990,8 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
+      config.graph_options.rewrite_options.arithmetic_optimization = (
+          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
@@ -1364,7 +1382,9 @@ class TensorFlowTestCase(googletest.TestCase):
                      " %s" % (a.shape, b.shape, msg))
     same = (a == b)
 
-    if a.dtype == np.float32 or a.dtype == np.float64:
+    if (a.dtype in [
+        np.float16, np.float32, np.float64, dtypes.bfloat16.as_numpy_dtype
+    ]):
       same = np.logical_or(same, np.logical_and(np.isnan(a), np.isnan(b)))
     if not np.all(same):
       # Prints more details than np.testing.assert_array_equal.
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index d08b4bf48a3d3cc89fc73f4c97df0574deee871e..472ccbcac7a447926989cfbef27ec1ea9d71e91c 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -31,13 +31,17 @@ __monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 VERSION = __version__
 tf_export("VERSION", "__version__").export_constant(__name__, "VERSION")
 GIT_VERSION = __git_version__
-tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
+tf_export("GIT_VERSION", "__git_version__").export_constant(
+    __name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
-tf_export("COMPILER_VERSION").export_constant(__name__, "COMPILER_VERSION")
+tf_export("COMPILER_VERSION", "__compiler_version__").export_constant(
+    __name__, "COMPILER_VERSION")
 CXX11_ABI_FLAG = __cxx11_abi_flag__
-tf_export("CXX11_ABI_FLAG").export_constant(__name__, "CXX11_ABI_FLAG")
+tf_export("CXX11_ABI_FLAG", "__cxx11_abi_flag__").export_constant(
+    __name__, "CXX11_ABI_FLAG")
 MONOLITHIC_BUILD = __monolithic_build__
-tf_export("MONOLITHIC_BUILD").export_constant(__name__, "MONOLITHIC_BUILD")
+tf_export("MONOLITHIC_BUILD", "__monolithic_build__").export_constant(
+    __name__, "MONOLITHIC_BUILD")
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
 tf_export("GRAPH_DEF_VERSION").export_constant(__name__, "GRAPH_DEF_VERSION")
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 067c8213d4741936e4c28aaedf4f30639b8cdc41..6816e204075bc37c6958efa5b028417078c36b2b 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -320,7 +320,8 @@ static PyObject* TF_MeasureCosts(
   tensorflow::OpPerformanceList op_performance_data;
   tensorflow::StepStats step_stats;
 
-  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), 10, 0);
+  const int num_measurements = cluster->type() == "virtual" ? 1 : 10;
+  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), num_measurements, 0);
 
   tensorflow::grappler::Costs costs;
   tensorflow::Status status = _GetOpPerformanceDataAndRunTime(
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 57f5097639564eabadf848584fd68fb674511bda..024a8cd3d170824c072721b79e261c131029af4d 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -20,7 +20,6 @@ py_library(
     srcs = [
         "__init__.py",
         "_impl/keras/__init__.py",
-        "_impl/keras/activations.py",
         "_impl/keras/applications/__init__.py",
         "_impl/keras/applications/densenet.py",
         "_impl/keras/applications/imagenet_utils.py",
@@ -32,9 +31,6 @@ py_library(
         "_impl/keras/applications/vgg16.py",
         "_impl/keras/applications/vgg19.py",
         "_impl/keras/applications/xception.py",
-        "_impl/keras/backend.py",
-        "_impl/keras/callbacks.py",
-        "_impl/keras/constraints.py",
         "_impl/keras/datasets/__init__.py",
         "_impl/keras/datasets/boston_housing.py",
         "_impl/keras/datasets/cifar.py",
@@ -44,49 +40,13 @@ py_library(
         "_impl/keras/datasets/imdb.py",
         "_impl/keras/datasets/mnist.py",
         "_impl/keras/datasets/reuters.py",
-        "_impl/keras/engine/__init__.py",
-        "_impl/keras/engine/base_layer.py",
-        "_impl/keras/engine/input_layer.py",
-        "_impl/keras/engine/network.py",
-        "_impl/keras/engine/saving.py",
-        "_impl/keras/engine/sequential.py",
-        "_impl/keras/engine/training.py",
-        "_impl/keras/engine/training_arrays.py",
-        "_impl/keras/engine/training_eager.py",
-        "_impl/keras/engine/training_generator.py",
-        "_impl/keras/engine/training_utils.py",
         "_impl/keras/estimator.py",
-        "_impl/keras/initializers.py",
-        "_impl/keras/layers/__init__.py",
-        "_impl/keras/layers/advanced_activations.py",
-        "_impl/keras/layers/convolutional.py",
-        "_impl/keras/layers/convolutional_recurrent.py",
-        "_impl/keras/layers/core.py",
-        "_impl/keras/layers/embeddings.py",
-        "_impl/keras/layers/local.py",
-        "_impl/keras/layers/merge.py",
-        "_impl/keras/layers/noise.py",
-        "_impl/keras/layers/normalization.py",
-        "_impl/keras/layers/pooling.py",
-        "_impl/keras/layers/recurrent.py",
-        "_impl/keras/layers/serialization.py",
-        "_impl/keras/layers/wrappers.py",
-        "_impl/keras/losses.py",
-        "_impl/keras/metrics.py",
-        "_impl/keras/models.py",
-        "_impl/keras/optimizers.py",
         "_impl/keras/preprocessing/__init__.py",
         "_impl/keras/preprocessing/image.py",
         "_impl/keras/preprocessing/sequence.py",
         "_impl/keras/preprocessing/text.py",
-        "_impl/keras/regularizers.py",
         "_impl/keras/testing_utils.py",
         "_impl/keras/utils/__init__.py",
-        "_impl/keras/utils/conv_utils.py",
-        "_impl/keras/utils/data_utils.py",
-        "_impl/keras/utils/generic_utils.py",
-        "_impl/keras/utils/io_utils.py",
-        "_impl/keras/utils/layer_utils.py",
         "_impl/keras/utils/multi_gpu_utils.py",
         "_impl/keras/utils/np_utils.py",
         "_impl/keras/utils/vis_utils.py",
@@ -136,7 +96,21 @@ py_library(
         ":empty_condition": [],
         "//conditions:default": [],
     }) + [
-        "@six_archive//:six",
+        ":backend",
+        ":engine",
+        ":layers",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_library(
+    name = "backend",
+    srcs = ["_impl/keras/backend.py"],
+    srcs_version = "PY2AND3",
+    deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -152,8 +126,6 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:layers_base",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
@@ -168,13 +140,84 @@ py_library(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/saved_model",
+    ],
+)
+
+py_library(
+    name = "engine",
+    srcs = [
+        "_impl/keras/activations.py",
+        "_impl/keras/callbacks.py",
+        "_impl/keras/constraints.py",
+        "_impl/keras/engine/__init__.py",
+        "_impl/keras/engine/base_layer.py",
+        "_impl/keras/engine/input_layer.py",
+        "_impl/keras/engine/network.py",
+        "_impl/keras/engine/saving.py",
+        "_impl/keras/engine/sequential.py",
+        "_impl/keras/engine/training.py",
+        "_impl/keras/engine/training_arrays.py",
+        "_impl/keras/engine/training_eager.py",
+        "_impl/keras/engine/training_generator.py",
+        "_impl/keras/engine/training_utils.py",
+        "_impl/keras/initializers.py",
+        "_impl/keras/losses.py",
+        "_impl/keras/metrics.py",
+        "_impl/keras/models.py",
+        "_impl/keras/optimizers.py",
+        "_impl/keras/regularizers.py",
+        "_impl/keras/utils/data_utils.py",
+        "_impl/keras/utils/io_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "layers",
+    srcs = [
+        "_impl/keras/layers/__init__.py",
+        "_impl/keras/layers/advanced_activations.py",
+        "_impl/keras/layers/convolutional.py",
+        "_impl/keras/layers/convolutional_recurrent.py",
+        "_impl/keras/layers/core.py",
+        "_impl/keras/layers/embeddings.py",
+        "_impl/keras/layers/local.py",
+        "_impl/keras/layers/merge.py",
+        "_impl/keras/layers/noise.py",
+        "_impl/keras/layers/normalization.py",
+        "_impl/keras/layers/pooling.py",
+        "_impl/keras/layers/recurrent.py",
+        "_impl/keras/layers/serialization.py",
+        "_impl/keras/layers/wrappers.py",
+        "_impl/keras/utils/conv_utils.py",
+        "_impl/keras/utils/generic_utils.py",
+        "_impl/keras/utils/layer_utils.py",
+        "_impl/keras/utils/tf_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":engine",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -605,10 +648,11 @@ py_test(
 
 py_test(
     name = "data_utils_test",
-    size = "medium",
+    size = "large",
     srcs = ["_impl/keras/utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_windows",
         "noasan",  # times out
         "notsan",
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index b518898ad8fe2fe7b859ec80714d610242d621dc..8def7ec49375c7ce23e8f2a24a4c3615d05ca9bb 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -22,10 +22,8 @@ import six
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.layers.base import Layer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -136,12 +134,6 @@ def get(identifier):
     identifier = str(identifier)
     return deserialize(identifier)
   elif callable(identifier):
-    if isinstance(identifier, Layer):
-      logging.warning(
-          'Do not pass a layer instance (such as {identifier}) as the '
-          'activation argument of another layer. Instead, advanced '
-          'activation layers should be used just like any other '
-          'layer in a model.'.format(identifier=identifier.__class__.__name__))
     return identifier
   else:
     raise ValueError('Could not interpret '
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index f8c6aff4f2b9b26db4104c15d6ca52cccafa9d40..c3a92bea8920cad3297fee3efc50158813e72361 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -237,9 +237,8 @@ def ResNet50(include_top=True,
   else:
     bn_axis = 1
 
-  x = ZeroPadding2D(padding=(3, 3), name='conv1_pad')(img_input)
   x = Conv2D(
-      64, (7, 7), strides=(2, 2), padding='valid', name='conv1')(x)
+      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(img_input)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 3aac6a9065cfa6189db1a3d3b33648dc980161b6..449410fe082421193d178b768db2ad1eda183b36 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 import collections
 import json
 import os
+import weakref
 
 import numpy as np
 
@@ -35,7 +36,6 @@ from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -55,7 +55,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
-from tensorflow.python.training import moving_averages
+
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
@@ -263,6 +263,12 @@ def set_image_data_format(data_format):
   _IMAGE_DATA_FORMAT = str(data_format)
 
 
+# A global dictionary mapping graph objects to an index of counters used
+# for various layer names in each graph.
+# Allows to give unique autogenerated names to layers, in a graph-specific way.
+PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
+
+
 @tf_export('keras.backend.get_uid')
 def get_uid(prefix=''):
   """Associates a string prefix with an integer counter in a TensorFlow graph.
@@ -283,17 +289,16 @@ def get_uid(prefix=''):
   ```
   """
   graph = ops.get_default_graph()
-  if graph not in tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS:
-    tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(
-        int)
-  layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph]
+  if graph not in PER_GRAPH_LAYER_NAME_UIDS:
+    PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(int)
+  layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph]
   layer_name_uids[prefix] += 1
   return layer_name_uids[prefix]
 
 
 @tf_export('keras.backend.reset_uids')
 def reset_uids():
-  per_graph_layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS
+  per_graph_layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS
   keys = list(per_graph_layer_name_uids.keys())
   for key in keys:
     del per_graph_layer_name_uids[key]
@@ -1276,6 +1281,11 @@ def moving_average_update(x, value, momentum):
   Returns:
       An Operation to update the variable.
   """
+  # `training` is higher-up than the Keras backend in the abstraction hierarchy.
+  # In particular, `training` depends on layers, and thus on Keras.
+  # moving_averages, being low-level ops, should not be part of the training
+  # module.
+  from tensorflow.python.training import moving_averages  # pylint: disable=g-import-not-at-top
   return moving_averages.assign_moving_average(
       x, value, momentum, zero_debias=True)
 
@@ -2750,8 +2760,7 @@ class Function(object):
       outputs: Output tensors to fetch.
       updates: Additional update ops to be run at function call.
       name: A name to help users identify what this function does.
-      session_kwargs: Arguments to `tf.Session.run()`: `fetches`, `feed_dict`,
-        `options`, `run_metadata`
+      session_kwargs: Arguments to `tf.Session.run()`: `fetches`, `feed_dict`.
   """
 
   def __init__(self, inputs, outputs, updates=None, name=None,
@@ -2785,19 +2794,76 @@ class Function(object):
     self.fetches = session_kwargs.pop('fetches', [])
     if not isinstance(self.fetches, list):
       self.fetches = [self.fetches]
+    # The main use case of `fetches` being passed to a model is the ability
+    # to run custom updates (since the outputs of fetches are never returned).
+    # This requires us to wrap fetches in `identity` ops.
+    self.fetches = [array_ops.identity(x) for x in self.fetches]
     self.session_kwargs = session_kwargs
 
+    if session_kwargs:
+      raise ValueError('Some keys in session_kwargs are not supported at this '
+                       'time: %s', session_kwargs.keys())
+
+    self._callable_fn = None
+    self._feed_arrays = None
+    self._feed_symbols = None
+    self._symbol_vals = None
+    self._session = None
+
+  def _make_callable(self, feed_arrays, feed_symbols, symbol_vals, session):
+    """Generates a callable that runs the graph.
+
+    Arguments:
+      feed_arrays: List of input tensors to be fed Numpy arrays at runtime.
+      feed_symbols: List of input tensors to be fed symbolic tensors at runtime.
+      symbol_vals: List of symbolic tensors to be fed to `feed_symbols`.
+      session: Session to use to generate the callable.
+
+    Returns:
+      Function that runs the graph according to the above options.
+    """
+    # Prepare callable options.
+    callable_opts = config_pb2.CallableOptions()
+    # Handle external-data feed.
+    for x in feed_arrays:
+      callable_opts.feed.append(x.name)
+    if self.feed_dict:
+      for key in sorted(self.feed_dict.keys()):
+        callable_opts.feed.append(key.name)
+    # Handle symbolic feed.
+    for x, y in zip(feed_symbols, symbol_vals):
+      connection = callable_opts.tensor_connection.add()
+      if x.dtype != y.dtype:
+        y = math_ops.cast(y, dtype=x.dtype)
+      from_tensor = ops._as_graph_element(y)
+      if from_tensor is None:
+        from_tensor = y
+      connection.from_tensor = from_tensor.name  # Data tensor
+      connection.to_tensor = x.name  # Placeholder
+    # Handle fetches.
+    for x in self.outputs + self.fetches:
+      callable_opts.fetch.append(x.name)
+    # Handle updates.
+    callable_opts.target.append(self.updates_op.name)
+    # Create callable.
+    callable_fn = session._make_callable_from_options(callable_opts)
+    # Cache parameters corresponding to the generated callable, so that
+    # we can detect future mismatches and refresh the callable.
+    self._callable_fn = callable_fn
+    self._feed_arrays = feed_arrays
+    self._feed_symbols = feed_symbols
+    self._symbol_vals = symbol_vals
+    self._session = session
+
   def __call__(self, inputs):
     if not isinstance(inputs, (list, tuple)):
       raise TypeError('`inputs` should be a list or tuple.')
 
-    if self.feed_dict:
-      feed_dict = self.feed_dict.copy()
-    else:
-      feed_dict = {}
-
     session = get_session()
-    data_tensors_to_feed = []
+    feed_arrays = []
+    array_vals = []
+    feed_symbols = []
+    symbol_vals = []
     for tensor, value in zip(self.inputs, inputs):
       if value is None:
         continue
@@ -2806,23 +2872,31 @@ class Function(object):
         indices = np.concatenate((np.expand_dims(sparse_coo.row, 1),
                                   np.expand_dims(sparse_coo.col, 1)), 1)
         value = (indices, sparse_coo.data, sparse_coo.shape)
-      elif tensor_util.is_tensor(value):
-        data_tensors_to_feed.append((tensor, value))
+      if tensor_util.is_tensor(value):
+        # Case: feeding symbolic tensor.
+        feed_symbols.append(tensor)
+        symbol_vals.append(value)
       else:
-        feed_dict[tensor] = value
+        # Case: feeding Numpy array.
+        feed_arrays.append(tensor)
+        # We need to do array conversion and type casting at this level, since
+        # `callable_fn` only supports exact matches.
+        array_vals.append(np.asarray(value, dtype=tensor.dtype.base_dtype.name))
+    if self.feed_dict:
+      for key in sorted(self.feed_dict.keys()):
+        array_vals.append(
+            np.asarray(self.feed_dict[key], dtype=key.dtype.base_dtype.name))
 
-    if data_tensors_to_feed:
-      # This is a *temporary* workaround (i.e. hack) to feed a symbolic tensor
-      # to `feed_dict`. It is very inefficient. It will be removed as soon
-      # as it becomes possible to pass symbolic tensors to `feed_dict`.
-      data_tensor_values = session.run([x[1] for x in data_tensors_to_feed])
-      for i, v in enumerate(data_tensor_values):
-        feed_dict[data_tensors_to_feed[i][0]] = v
+    # Refresh callable if anything has changed.
+    if (self._callable_fn is None or
+        feed_arrays != self._feed_arrays or
+        symbol_vals != self._symbol_vals or
+        feed_symbols != self._feed_symbols or
+        session != self._session):
+      self._make_callable(feed_arrays, feed_symbols, symbol_vals, session)
 
-    fetches = self.outputs + [self.updates_op] + self.fetches
-    updated = session.run(
-        fetches=fetches, feed_dict=feed_dict, **self.session_kwargs)
-    return updated[:len(self.outputs)]
+    fetched = self._callable_fn(*array_vals)
+    return fetched[:len(self.outputs)]
 
 
 @tf_export('keras.backend.function')
@@ -3374,7 +3448,7 @@ def categorical_crossentropy(target, output, from_logits=False):
   Returns:
       Output tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.softmax_cross_entropy_with_logits_v2
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
@@ -3438,7 +3512,7 @@ def binary_crossentropy(target, output, from_logits=False):
   Returns:
       A tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.sigmoid_cross_entropy_with_logits
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # transform back to logits
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index fb4b2a0e1dc06c904d4b93038840dbf688d42ed4..de1ed467a2764a2b08269181bb8bcd615868d3b1 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -189,6 +189,39 @@ class BackendUtilsTest(test.TestCase):
     for y in ys:
       self.assertEqual(y.op.name[:12], 'StopGradient')
 
+  def test_function_tf_feed_symbols(self):
+    with self.test_session():
+      # Test feeding a resource variable to `function`.
+      x1 = keras.backend.placeholder(shape=())
+      x2 = keras.backend.placeholder(shape=())
+      lr = keras.backend.learning_phase()  # Include a placeholder_with_default.
+
+      y1 = keras.backend.variable(10.)
+      y2 = 3
+
+      f = keras.backend.function(
+          inputs=[x1, x2, lr],
+          outputs=[x1 + 1,
+                   keras.backend.in_train_phase(x2 + 2, x2 - 1)])
+      outs = f([y1, y2, None])  # Use default learning_phase value.
+      self.assertEqual(outs, [11., 2.])
+      outs = f([y1, y2, 1])  # Set learning phase value.
+      self.assertEqual(outs, [11., 5.])
+
+      # Test triggering a callable refresh by changing the input.
+      y3 = keras.backend.constant(20.)  # Test with tensor
+      outs = f([y3, y2, None])
+      self.assertEqual(outs, [21., 2.])
+
+      y4 = 4  # Test with non-symbol
+      outs = f([y4, y2, None])
+      self.assertEqual(outs, [5., 2.])
+
+      # Test with a different dtype
+      y5 = keras.backend.constant(10., dtype='float64')
+      outs = f([y5, y2, None])
+      self.assertEqual(outs, [11., 2.])
+
   def test_function_tf_fetches(self):
     # Additional operations can be passed to tf.Session().run() via its
     # `fetches` arguments. In contrast to `updates` argument of
@@ -206,8 +239,9 @@ class BackendUtilsTest(test.TestCase):
                                  updates=[(x, x_placeholder + 1.)],
                                  fetches=[keras.backend.update(y, 5.)])
       output = f([10., 20.])
-      assert output == [30.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [11., 5.]
+      self.assertEqual(output, [30.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [11., 5.])
 
   def test_function_tf_feed_dict(self):
     # Additional substitutions can be passed to `tf.Session().run()` via its
@@ -229,14 +263,16 @@ class BackendUtilsTest(test.TestCase):
                                  feed_dict=feed_dict,
                                  fetches=fetches)
       output = f([10.])
-      assert output == [11.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [20., 30.]
+      self.assertEqual(output, [11.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [20., 30.])
 
       # updated value in feed_dict will be modified within the K.function()
       feed_dict[y_placeholder] = 4.
       output = f([20.])
-      assert output == [21.]
-      assert keras.backend.get_session().run(fetches=[x, y]) == [30., 40.]
+      self.assertEqual(output, [21.])
+      self.assertEqual(
+          keras.backend.get_session().run(fetches=[x, y]), [30., 40.])
 
 
 class BackendVariableTest(test.TestCase):
diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
index 755607aafb9343f0c9f10c5f9394bedc2f8afd76..6c68d251275cad4a6f2d6a5bf959a42c041537f3 100644
--- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
@@ -13,143 +13,145 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=protected-access
-"""Base layer code (`Layer`).
-"""
+"""Contains the base Layer class, from which all layers inherit."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
+import re
 
+import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
+from tensorflow.python.estimator import util as estimator_util
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.layers import base as tf_base_layers
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.training import checkpointable
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
-# pylint: disable=invalid-name
-InputSpec = tf_base_layers.InputSpec
-Node = tf_base_layers.Node
-TFBaseLayer = tf_base_layers.Layer
-# pylint: enable=invalid-name
+@tf_export('keras.layers.Layer')
+class Layer(checkpointable.CheckpointableBase):
+  """Base layer class.
 
+  This is the class from which all layers inherit.
 
-@tf_export('keras.layers.Layer')
-class Layer(tf_base_layers.Layer):
-  """Abstract base layer class.
-
-  # Properties
-      name: String, must be unique within a model.
-      input_spec: List of InputSpec class instances
-          each entry describes one required input:
-              - ndim
-              - dtype
-          A layer with `n` input tensors must have
-          an `input_spec` of length `n`.
-      trainable: Boolean, whether the layer weights
-          will be updated during training.
-      uses_learning_phase: Whether any operation
-          of the layer uses `K.in_training_phase()`
-          or `K.in_test_phase()`.
-      input_shape: Shape tuple. Provided for convenience,
-          but note that there may be cases in which this
-          attribute is ill-defined (e.g. a shared layer
-          with multiple input shapes), in which case
-          requesting `input_shape` will raise an Exception.
-          Prefer using `layer.get_input_shape_for(input_shape)`,
-          or `layer.get_input_shape_at(node_index)`.
-      output_shape: Shape tuple. See above.
-      inbound_nodes: List of nodes.
-      outbound_nodes: List of nodes.
-      input, output: Input/output tensor(s). Note that if the layer is used
-          more than once (shared layer), this is ill-defined
-          and will raise an exception. In such cases, use
-          `layer.get_input_at(node_index)`.
-      input_mask, output_mask: Same as above, for masks.
-      trainable_weights: List of variables.
-      non_trainable_weights: List of variables.
-      weights: The concatenation of the lists trainable_weights and
-          non_trainable_weights (in this order).
-
-  # Methods
-      call(x, mask=None): Where the layer's logic lives.
-      __call__(x, mask=None): Wrapper around the layer logic (`call`).
-          If x is a Keras tensor:
-              - Connect current layer with last layer from tensor:
-                  `self._add_inbound_node(last_layer)`
-              - Add layer to tensor history
-          If layer is not built:
-              - Build from inputs shape
-      get_weights()
-      set_weights(weights)
-      get_config()
-      count_params()
-      compute_output_shape(input_shape)
-      compute_mask(x, mask)
-      get_input_at(node_index)
-      get_output_at(node_index)
-      get_input_shape_at(node_index)
-      get_output_shape_at(node_index)
-      get_input_mask_at(node_index)
-      get_output_mask_at(node_index)
-
-  # Class Methods
-      from_config(config)
-
-  # Internal methods:
-      build(input_shape)
-      _add_inbound_node(layer, index=0)
+  A layer is a class implementing common neural networks operations, such
+  as convolution, batch norm, etc. These operations require managing weights,
+  losses, updates, and inter-layer connectivity.
+
+  Users will just instantiate a layer and then treat it as a callable.
+
+  We recommend that descendants of `Layer` implement the following methods:
+  * `__init__()`: Save configuration in member variables
+  * `build()`: Called once from `__call__`, when we know the shapes of inputs
+    and `dtype`. Should have the calls to `add_weight()`, and then
+    call the super's `build()` (which sets `self.built = True`, which is
+    nice in case the user wants to call `build()` manually before the
+    first `__call__`).
+  * `call()`: Called in `__call__` after making sure `build()` has been called
+    once. Should actually perform the logic of applying the layer to the
+    input tensors (which should be passed in as the first argument).
+
+  Arguments:
+    trainable: Boolean, whether the layer's variables should be trainable.
+    name: String name of the layer.
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
+
+  Read-only properties:
+    name: The name of the layer (string).
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and
+      non-trainable.
+    updates: List of update ops of this layer.
+    losses: List of losses added by this layer.
+    trainable_weights: List of variables to be included in backprop.
+    non_trainable_weights: List of variables that should not be
+      included in backprop.
+    weights: The concatenation of the lists trainable_weights and
+      non_trainable_weights (in this order).
+
+  Mutable properties:
+    trainable: Whether the layer should be trained (boolean).
+    input_spec: Optional (list of) `InputSpec` object(s) specifying the
+      constraints on inputs that can be accepted by the layer.
   """
 
-  def __init__(self, **kwargs):
+  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
     # These properties should be set by the user via keyword arguments.
     # note that 'dtype', 'input_shape' and 'batch_input_shape'
     # are only applicable to input layers: do not pass these keywords
     # to non-input layers.
     allowed_kwargs = {
-        'activity_regularizer',
         'input_shape',
         'batch_input_shape',
         'batch_size',
-        'dtype',
-        'name',
-        'trainable',
         'weights',
+        'activity_regularizer',
     }
     # Validate optional keyword arguments.
     for kwarg in kwargs:
       if kwarg not in allowed_kwargs:
         raise TypeError('Keyword argument not understood:', kwarg)
 
-    # Get layer name.
-    name = kwargs.get('name')
-
-    # Get `trainable` status.
-    trainable = kwargs.get('trainable', True)
-
-    # Get `dtype`.
-    dtype = kwargs.get('dtype')
-    if dtype is None:
-      dtype = K.floatx()
-
-    # Call super, which will set all properties common to Keras layers
-    # and core TF layers.
-    super(Layer, self).__init__(
-        name=name, dtype=dtype, trainable=trainable,
-        activity_regularizer=kwargs.get('activity_regularizer'))
+    # Mutable properties
+    # Indicates whether the layer's weights are updated during training
+    # and whether the layer's updates are run during training
+    self.trainable = trainable
+    # A stateful layer is a layer whose updates are run during inference too,
+    # for instance stateful RNNs.
+    self.stateful = False
+    # Indicates whether `build` needs to be called upon layer call, to create
+    # the layer's weights.
+    self.built = False
+    # Provides information about which inputs are compatible with the layer.
+    self.input_spec = None
+
+    self._init_set_name(name)
+
+    activity_regularizer = kwargs.pop('activity_regularizer', None)
+    if activity_regularizer and context.executing_eagerly():
+      raise ValueError(
+          ('Activity regularization is not supported when executing eagerly. '
+           'Got activity_regularizer=%s') % (activity_regularizer,))
+    self._activity_regularizer = activity_regularizer
+    self._trainable_weights = []
+    self._non_trainable_weights = []
+    self._updates = []
+    # When executing eagerly, _losses is a list of zero-argument lambdas which
+    # return tensors. When using graph execution, _losses is a list of ops.
+    self._losses = []
+    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
+    self._call_fn_args = estimator_util.fn_args(self.call)
+    self._compute_previous_mask = ('mask' in self._call_fn_args or
+                                   hasattr(self, 'compute_mask'))
     self._uses_inputs_arg = True
 
-    # Add properties that are Keras-only for now.
+    # These lists will be filled via successive calls
+    # to self._add_inbound_node().
+    self._inbound_nodes = []
+    self._outbound_nodes = []
+
     self.supports_masking = False
 
     # Manage input shape information if passed.
@@ -172,39 +174,418 @@ class Layer(tf_base_layers.Layer):
     else:
       self._initial_weights = None
 
-  def add_weight(self,
-                 name,
-                 shape,
+  def _init_set_name(self, name, zero_based=True):
+    if not name:
+      self._name = unique_layer_name(
+          to_snake_case(self.__class__.__name__), zero_based=zero_based)
+    else:
+      self._name = name
+
+  @property
+  def dtype(self):
+    return self._dtype
+
+  @property
+  def name(self):
+    return self._name
+
+  @property
+  def activity_regularizer(self):
+    """Optional regularizer function for the output of this layer."""
+    return self._activity_regularizer
+
+  @activity_regularizer.setter
+  def activity_regularizer(self, regularizer):
+    """Optional regularizer function for the output of this layer."""
+    self._activity_regularizer = regularizer
+
+  @property
+  def trainable_weights(self):
+    return self._trainable_weights if self.trainable else []
+
+  @property
+  def non_trainable_weights(self):
+    if self.trainable:
+      return self._non_trainable_weights
+    else:
+      return self._trainable_weights + self._non_trainable_weights
+
+  @property
+  def trainable_variables(self):
+    return self.trainable_weights
+
+  @property
+  def non_trainable_variables(self):
+    return self.non_trainable_weights
+
+  @property
+  def weights(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.trainable_weights + self.non_trainable_weights
+
+  @property
+  def variables(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.weights
+
+  @property
+  def updates(self):
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.updates not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
+    return self._updates
+
+  def add_update(self, updates, inputs=None):
+    """Add update op(s), potentially dependent on layer inputs.
+
+    Weight updates (for instance, the updates of the moving mean and variance
+    in a BatchNormalization layer) may be dependent on the inputs passed
+    when calling a layer. Hence, when reusing the same layer on
+    different inputs `a` and `b`, some entries in `layer.updates` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_updates_for` method allows to retrieve the updates relevant to a
+    specific set of inputs.
+
+    This call is ignored when eager execution is enabled (in that case, variable
+    updates are run on the fly and thus do not need to be tracked for later
+    execution).
+
+    Arguments:
+      updates: Update op, or list/tuple of update ops.
+      inputs: If anything other than None is passed, it signals the updates
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for BatchNormalization updates, for instance.
+        If None, the updates will be taken into account unconditionally,
+        and you are responsible for making sure that any dependency they might
+        have is available at runtime.
+        A step counter might fall into this category.
+    """
+    if context.executing_eagerly():
+      return  # Updates already applied when in eager mode.
+
+    def process_update(x):
+      if isinstance(x, ops.Operation):
+        return x
+      elif hasattr(x, 'op'):
+        return x.op
+      else:
+        return ops.convert_to_tensor(x)
+
+    updates = generic_utils.to_list(updates)
+    updates = [process_update(x) for x in updates]
+    self._updates += updates
+    if inputs is None:
+      for u in updates:
+        u._unconditional_update = True  # pylint: disable=protected-access
+    else:
+      for u in updates:
+        u._unconditional_update = False  # pylint: disable=protected-access
+
+  def get_updates_for(self, inputs):
+    """Retrieves updates relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+
+    Returns:
+      List of update ops of the layer that depend on `inputs`.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
+
+    # Updates disabled if layer is not trainable and not explicitly stateful.
+    if not self.trainable and not self.stateful:
+      return []
+
+    if inputs is None:
+      # Requesting unconditional updates.
+      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+
+    # Requesting input-conditional updates.
+    inputs = nest.flatten(inputs)
+    reachable = get_reachable_from_inputs(inputs, self.updates)
+    updates = []
+    for update in self.updates:
+      if update in reachable:
+        updates.append(update)
+    return updates
+
+  @property
+  def losses(self):
+    """Losses which are associated with this `Layer`.
+
+    Note that when executing eagerly, getting this property evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
+    if context.executing_eagerly():
+      # _losses may only contain variable regularization losses when executing
+      # eagerly, and they have been saved as lambdas to be executed when
+      # requested.
+      return [regularizer() for regularizer in self._losses]
+    else:
+      return self._losses
+
+  def add_loss(self, losses, inputs=None):
+    """Add loss tensor(s), potentially dependent on layer inputs.
+
+    Some losses (for instance, activity regularization losses) may be dependent
+    on the inputs passed when calling a layer. Hence, when reusing the same
+    layer on different inputs `a` and `b`, some entries in `layer.losses` may
+    be dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_losses_for` method allows to retrieve the losses relevant to a
+    specific set of inputs.
+
+    Note that `add_loss` is not supported when executing eagerly. Instead,
+    variable regularizers may be added through `add_variable`. Activity
+    regularization is not supported directly (but such losses may be returned
+    from `Layer.call()`).
+
+    Arguments:
+      losses: Loss tensor, or list/tuple of tensors.
+      inputs: If anything other than None is passed, it signals the losses
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for activity regularization losses, for instance.
+        If `None` is passed, the losses are assumed
+        to be unconditional, and will apply across all dataflows of the layer
+        (e.g. weight regularization losses).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      # TODO(fchollet): it should be possible (and highly desirable) to support
+      # `add_loss` in eager mode. This allows great convenience and flexibility
+      # in defining custom losses on the fly (e.g. in VAEs).
+      # Simply appending the loss value to `self._losses`
+      # is the correct behavior.
+      # The only caveat is that we need to force the user to only call
+      # `add_loss` from inside a model or Layer's `call` method
+      # (otherwise the loss computation cannot be backproped through).
+      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
+
+    losses = generic_utils.to_list(losses)
+    self._losses += losses
+    if inputs is None:
+      for loss in losses:
+        loss._unconditional_loss = True  # pylint: disable=protected-access
+    else:
+      for loss in losses:
+        loss._unconditional_loss = False  # pylint: disable=protected-access
+
+  def get_losses_for(self, inputs):
+    """Retrieves losses relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+
+    Returns:
+      List of loss tensors of the layer that depend on `inputs`.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
+
+    if inputs is None:
+      # Requesting unconditional losses.
+      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+
+    # Requesting input-conditional losses.
+    inputs = nest.flatten(inputs)
+    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
+    # The losses we want to return will be part of this set.
+    # To avoid unnecessary work, we stop the search in case all of
+    # `self.losses` have been retrieved.
+    reachable = get_reachable_from_inputs(inputs, self.losses)
+    losses = []
+    for loss in self.losses:
+      if loss in reachable:
+        losses.append(loss)
+    return losses
+
+  def _name_scope(self):
+    return self.name
+
+  def build(self, _):
+    """Creates the variables of the layer."""
+    self.built = True
+
+  def add_variable(self, *args, **kwargs):
+    """Alias for `add_weight`."""
+    return self.add_weight(*args, **kwargs)
+
+  def add_weight(self, name, shape,
                  dtype=None,
                  initializer=None,
                  regularizer=None,
                  trainable=True,
-                 constraint=None):
-    """Adds a weight variable to the layer.
+                 constraint=None,
+                 partitioner=None,
+                 use_resource=None,
+                 getter=None):
+    """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
-        name: String, the name for the weight variable.
-        shape: The shape tuple of the weight.
-        dtype: The dtype of the weight.
-        initializer: An Initializer instance (callable).
-        regularizer: An optional Regularizer instance.
-        trainable: A boolean, whether the weight should
-            be trained via backprop or not (assuming
-            that the layer itself is also trainable).
-        constraint: An optional Constraint instance.
+      name: variable name.
+      shape: variable shape.
+      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+      initializer: initializer instance (callable).
+      regularizer: regularizer instance (callable).
+      trainable: whether the variable should be part of the layer's
+        "trainable_variables" (e.g. variables, biases)
+        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+        Note, if the current variable scope is marked as non-trainable
+        then this parameter is ignored and any added variables are also
+        marked as non-trainable.
+      constraint: constraint instance (callable).
+      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      use_resource: Whether to use `ResourceVariable`.
+      getter: Variable getter argument to be passed to the `Checkpointable` API.
 
     Returns:
-        The created weight variable.
+      The created variable.  Usually either a `Variable` or `ResourceVariable`
+      instance.  If `partitioner` is not `None`, a `PartitionedVariable`
+      instance is returned.
+
+    Raises:
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
+      ValueError: When giving unsupported dtype and no initializer.
     """
     if dtype is None:
-      dtype = K.floatx()
-    weight = self.add_variable(name, shape,
-                               dtype=dtype,
-                               initializer=initializers.get(initializer),
-                               regularizer=regularizers.get(regularizer),
-                               constraint=constraints.get(constraint),
-                               trainable=trainable)
-    return weight
+      dtype = self.dtype or backend.floatx()
+    else:
+      dtype = dtypes.as_dtype(dtype)
+    initializer = initializers.get(initializer)
+    regularizer = regularizers.get(regularizer)
+    constraint = constraints.get(constraint)
+
+    # Initialize variable when no initializer provided
+    if initializer is None:
+      # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
+      if dtype.is_floating:
+        initializer = initializers.glorot_uniform()
+      # If dtype is DT_INT/DT_UINT, provide a default value `zero`
+      # If dtype is DT_BOOL, provide a default value `FALSE`
+      elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
+        initializer = initializers.zeros()
+      # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
+      else:
+        raise ValueError('An initializer for variable %s of type %s is required'
+                         ' for layer %s' % (name, dtype.base_dtype, self.name))
+
+    variable = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        # TODO(allenl): a `make_variable` equivalent should be added as a
+        # `Checkpointable` method.
+        getter=getter or make_variable,
+        # Manage errors in Layer rather than Checkpointable.
+        overwrite=True,
+        initializer=initializer,
+        dtype=dtypes.as_dtype(dtype),
+        constraint=constraint,
+        trainable=trainable and self.trainable,
+        partitioner=partitioner,
+        use_resource=use_resource)
+
+    if regularizer is not None:
+      # TODO(fchollet): in the future, this should be handled at the
+      # level of variable creation, and weight regularization losses
+      # should be variable attributes.
+      self._handle_weight_regularization(name, variable, regularizer)
+
+    if trainable:
+      self._trainable_weights.append(variable)
+    else:
+      self._non_trainable_weights.append(variable)
+    return variable
+
+  def _handle_weight_regularization(self, name, variable, regularizer):
+    # `init_graph` should point to the graph in which variable initialization
+    # will occur; it should be None if and only if initialization will take
+    # place in the eager context.
+    init_graph = None
+    if not context.executing_eagerly():
+      default_graph = ops.get_default_graph()
+      if default_graph.building_function:
+        with ops.init_scope():
+          # Retrieve the variables from the graph into which variables
+          # will be lifted; if initialization ops will be lifted into
+          # the eager context, then there is nothing to retrieve, since variable
+          # collections are not supported when eager execution is enabled.
+          if not context.executing_eagerly():
+            init_graph = ops.get_default_graph()
+      else:
+        # Initialization ops will not be lifted out of the default graph.
+        init_graph = default_graph
+
+    if init_graph is not None:  # pylint: disable=protected-access
+      # The variable was created and initialized in a graph.
+      if regularizer:
+        if isinstance(variable, tf_variables.PartitionedVariable):
+          for v in variable:
+            with ops.colocate_with(v.op):
+              with ops.name_scope(name + '/Regularizer'):
+                regularization = regularizer(v)
+            if regularization is not None:
+              self.add_loss(regularization)
+        else:
+          with ops.colocate_with(variable.op):
+            with ops.name_scope(name + '/Regularizer'):
+              regularization = regularizer(variable)
+          if regularization is not None:
+            self.add_loss(regularization)
+    elif regularizer:  # initialization took place in an eager context
+      if isinstance(variable, tf_variables.PartitionedVariable):
+        raise RuntimeError(
+            'Partitioned variable regularization is not yet '
+            'supported when executing eagerly. File a feature request'
+            'if this is important to you.')
+      # Save a zero-argument lambda which runs the regularizer on the
+      # variable, to be executed when `Layer.losses` is requested.
+      # This makes losses responsive to variable updates when executing
+      # eagerly.
+      #
+      # TODO(akshayka): Do the same for graphs as well, so that losses
+      # collected in a while_loop can be run outside its control flow
+      # context and so that losses won't be swallowed up by graph functions
+      # (i.e., `.losses()` should always create regularizers).
+      self._losses.append(lambda: regularizer(variable))
+
+  def _handle_activity_regularization(self, inputs, outputs):
+    # Apply activity regularization.
+    # Note that it should be applied every time the layer creates a new
+    # output, since it is output-specific.
+    if self._activity_regularizer:
+      output_list = nest.flatten(outputs)
+      for output in output_list:
+        with ops.name_scope('ActivityRegularizer'):
+          activity_regularization = self._activity_regularizer(output)
+        self.add_loss(activity_regularization, inputs=inputs)
 
   def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
     """This is where the layer's logic lives.
@@ -218,6 +599,215 @@ class Layer(tf_base_layers.Layer):
     """
     return inputs
 
+  def __call__(self, inputs, *args, **kwargs):
+    """Wraps `call`, applying pre- and post-processing steps.
+
+    Arguments:
+      inputs: input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+
+    Note:
+      - The following optional keyword arguments are reserved for specific uses:
+        * `training`: Boolean scalar tensor of Python boolean indicating
+          whether the `call` is meant for training or inference.
+        * `mask`: Boolean input mask.
+      - If the layer's `call` method takes a `mask` argument (as some Keras
+        layers do), its default value will be set to the mask generated
+        for `inputs` by the previous layer (if `input` did come from
+        a layer that generated a corresponding mask, i.e. if it came from
+        a Keras layer with masking support.
+
+    Raises:
+      ValueError: if the layer's `call` method returns None (an invalid value).
+    """
+    input_list = nest.flatten(inputs)
+
+    build_graph = not context.executing_eagerly()
+    # TODO(fchollet, allenl): Make deferred mode work with subclassed Models
+    # which don't use an "inputs" argument.
+    in_deferred_mode = isinstance(input_list[0], DeferredTensor)
+
+    # Handle Keras mask propagation from previous layer to current layer.
+    previous_mask = None
+    if (not hasattr(self, '_compute_previous_mask') or
+        self._compute_previous_mask):
+      previous_mask = collect_previous_mask(inputs)
+      if not hasattr(self, '_call_fn_args'):
+        self._call_fn_args = estimator_util.fn_args(self.call)
+      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
+          not is_all_none(previous_mask)):
+        # The previous layer generated a mask, and mask was not explicitly pass
+        # to __call__, hence we set previous_mask as the default value.
+        kwargs['mask'] = previous_mask
+
+    input_shapes = None
+
+    with ops.name_scope(self._name_scope()):
+      if not self.built:
+        if not build_graph:
+          # Activity regularization is currently unsupported in Eager mode.
+          if self._activity_regularizer:
+            raise ValueError(
+                'activity_regularizer currently unsupported with '
+                'eager execution enabled. Found an activity_regularizer in '
+                '%s(%s).' % (self.__class__.__name__, self))
+        if not build_graph and not in_deferred_mode:
+          for x in input_list:
+            if hasattr(x, '_keras_history'):
+              raise ValueError('_keras_history currently unsupported in '
+                               'Eager mode. Found _keras_history in %s while '
+                               'executing __call__ for %s(%s)' %
+                               (x, self.__class_.__name__, self))
+
+        # Check input assumptions set before layer building, e.g. input rank.
+        self._assert_input_compatibility(inputs)
+        if input_list and self._dtype is None:
+          try:
+            self._dtype = input_list[0].dtype.base_dtype.name
+          except AttributeError:
+            pass
+        if all(hasattr(x, 'get_shape') for x in input_list):
+          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+        self.build(input_shapes)
+
+      # Check input assumptions set after layer building, e.g. input shape.
+      if build_graph or in_deferred_mode:
+        self._assert_input_compatibility(inputs)
+
+      if not in_deferred_mode:
+        outputs = self.call(inputs, *args, **kwargs)
+        if outputs is None:
+          raise ValueError('A layer\'s `call` method should return a Tensor '
+                           'or a list of Tensors, not None (layer: ' +
+                           self.name + ').')
+      else:
+        # Deferred mode behavior: use `compute_output_shape` to
+        # infer the number of outputs of the layer and their shapes.
+        if input_shapes is None:
+          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+
+        output_shapes = self.compute_output_shape(input_shapes)
+        output_shapes = nest.flatten(output_shapes)
+        outputs = [
+            # TODO(fchollet): name the deferred tensors?
+            DeferredTensor(shape=shape, dtype=self._dtype)
+            for shape in output_shapes
+        ]
+        if len(outputs) == 1:
+          outputs = outputs[0]
+
+      if build_graph:
+        self._handle_activity_regularization(inputs, outputs)
+        # TODO(fchollet): consider enabling masking for Eager mode.
+        self._set_mask_metadata(inputs, outputs, previous_mask)
+
+      if in_deferred_mode or build_graph and have_all_keras_metadata(inputs):
+        inputs, outputs = self._set_connectivity_metadata_(
+            inputs, outputs, args, kwargs)
+
+      self.built = True
+      if context.executing_eagerly():
+        return outputs
+
+      if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
+        # Subclassed network: explicitly set metadata normally set by a call to
+        # self._set_inputs(). This is not relevant in eager execution.
+        self._symbolic_set_inputs(inputs, outputs)
+
+      if in_deferred_mode or build_graph:
+        self._set_learning_phase_metadata(inputs, outputs)
+
+    # Optionally load weight values that were specified at layer instantiation.
+    # TODO(fchollet): consider enabling this with eager execution too.
+    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
+      self.set_weights(self._initial_weights)
+      del self._initial_weights
+    return outputs
+
+  def apply(self, inputs, *args, **kwargs):
+    """Apply the layer on a input.
+
+    This simply wraps `self.__call__`.
+
+    Arguments:
+      inputs: Input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+    """
+    return self.__call__(inputs, *args, **kwargs)
+
+  def _set_learning_phase_metadata(self, inputs, outputs):
+    # Update learning phase info. To work with subclassed models,
+    # this should be done even if Keras metadata is absent.
+    output_tensors = generic_utils.to_list(outputs)
+    uses_lp = any(
+        [getattr(x, '_uses_learning_phase', False)
+         for x in generic_utils.to_list(inputs)])
+    uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
+    for i in range(len(output_tensors)):
+      try:
+        output_tensors[i]._uses_learning_phase = getattr(
+            output_tensors[i], '_uses_learning_phase', False) or uses_lp
+      except AttributeError:
+        # An output element happens to be a C type (such as tuple or dict).
+        # We don't track learning phase info in such edge cases.
+        pass
+
+  def _set_mask_metadata(self, inputs, outputs, previous_mask):
+    if hasattr(self, 'compute_mask'):
+      output_mask = self.compute_mask(inputs, previous_mask)
+      if isinstance(outputs, (list, tuple)):
+        if output_mask is None:
+          output_mask = [None for _ in range(len(outputs))]
+        for x, m in zip(outputs, output_mask):
+          try:
+            x._keras_mask = m  # pylint: disable=protected-access
+          except AttributeError:
+            pass  # C type such as dict. Masking not supported in this case.
+      else:
+        try:
+          outputs._keras_mask = output_mask  # pylint: disable=protected-access
+        except AttributeError:
+          pass  # C type such as dict. Masking not supported in this case.
+
+  def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
+    if args and getattr(self, '_uses_inputs_arg', True):
+      raise TypeError(
+          'This Layer takes an `inputs` argument to call(), and only the '
+          '`inputs` argument may be specified as a positional argument. '
+          'Pass everything else as a keyword argument (those arguments will'
+          ' not be tracked as inputs to the Layer).')
+
+    # If the layer returns tensors from its inputs, unmodified,
+    # we copy them to avoid loss of tensor metadata.
+    output_ls = nest.flatten(outputs)
+    output_ls_copy = []
+    for x in output_ls:
+      if x in nest.flatten(inputs):
+        with ops.name_scope(self.name):
+          x = array_ops.identity(x)
+      output_ls_copy.append(x)
+    if len(output_ls_copy) == 1:
+      outputs = output_ls_copy[0]
+    else:
+      outputs = output_ls_copy
+
+    inputs, kwargs = self._inputs_from_call_args(
+        call_args=(inputs,) + args, call_kwargs=kwargs)
+    # Add an inbound node to the layer, so it can keep track of this call.
+    # This updates the layer history of the output tensor(s).
+    kwargs.pop('mask', None)  # `mask` should not be serialized.
+    self._add_inbound_node(
+        input_tensors=inputs, output_tensors=outputs, arguments=kwargs)
+    return inputs, outputs
+
   def _inputs_from_call_args(self, call_args, call_kwargs):
     """Get Layer inputs from __call__ *args and **kwargs.
 
@@ -282,71 +872,6 @@ class Layer(tf_base_layers.Layer):
         input_arg_values.extend(bound_args[call_arg_spec.varargs])
       return input_arg_values, non_input_arg_values
 
-  def __call__(self, inputs, *args, **kwargs):
-    """Wrapper around self.call(), for handling internal references.
-
-    If a Keras tensor is passed:
-        - We call self._add_inbound_node().
-        - If necessary, we `build` the layer to match
-            the shape of the input(s).
-        - We update the _keras_history of the output tensor(s)
-            with the current layer.
-            This is done as part of _add_inbound_node().
-
-    Arguments:
-        inputs: Can be a tensor or list/tuple of tensors.
-        *args: Additional positional arguments to be passed to `call()`. Only
-          allowed in subclassed Models with custom call() signatures. In other
-          cases, `Layer` inputs must be passed using the `inputs` argument and
-          non-inputs must be keyword arguments.
-        **kwargs: Additional keyword arguments to be passed to `call()`.
-
-    Returns:
-        Output of the layer's `call` method.
-
-    Raises:
-        ValueError: in case the layer is missing shape information
-            for its `build` call.
-        TypeError: If positional arguments are passed and this `Layer` is not a
-            subclassed `Model`.
-    """
-    # Actually call the layer (optionally building it).
-    output = super(Layer, self).__call__(inputs, *args, **kwargs)
-
-    if args and getattr(self, '_uses_inputs_arg', True):
-      raise TypeError(
-          'This Layer takes an `inputs` argument to call(), and only the '
-          '`inputs` argument may be specified as a positional argument. Pass '
-          'everything else as a keyword argument (those arguments will not be '
-          'tracked as inputs to the Layer).')
-
-    if context.executing_eagerly():
-      return output
-
-    inputs, kwargs = self._inputs_from_call_args(
-        call_args=(inputs,) + args, call_kwargs=kwargs)
-
-    if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
-      # Subclassed network: explicitly set metadata normally set by a call to
-      # self._set_inputs().
-      self._symbolic_set_inputs(inputs, output)
-
-    # Update learning phase info.
-    output_tensors = generic_utils.to_list(output)
-    uses_lp = any(
-        [getattr(x, '_uses_learning_phase', False)
-         for x in generic_utils.to_list(inputs)])
-    uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
-    for i in range(len(output_tensors)):
-      output_tensors[i]._uses_learning_phase = getattr(
-          output_tensors[i], '_uses_learning_phase', False) or uses_lp
-
-    # Optionally load weight values that were specified at layer instantiation.
-    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
-      self.set_weights(self._initial_weights)
-      del self._initial_weights
-    return output
-
   def compute_output_shape(self, input_shape):
     """Computes the output shape of the layer.
 
@@ -362,13 +887,7 @@ class Layer(tf_base_layers.Layer):
     Returns:
         An input shape tuple.
     """
-    logging.warning(
-        'All custom layers should implement the '
-        '`compute_output_shape` method. This layer (' + self.name + ') '
-        'is relying on the base `Layer.compute_output_shape` implementation, '
-        'which will start raising a `NotImplementedError` '
-        'as of July 1st, 2018.')
-    return input_shape
+    raise NotImplementedError
 
   def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
     """Computes an output mask tensor.
@@ -396,6 +915,87 @@ class Layer(tf_base_layers.Layer):
     # carry over the input mask
     return mask
 
+  def _add_inbound_node(self,
+                        input_tensors,
+                        output_tensors,
+                        arguments=None):
+    """Internal method to create an inbound node for the layer.
+
+    Arguments:
+        input_tensors: list of input tensors.
+        output_tensors: list of output tensors.
+        arguments: dictionary of keyword arguments that were passed to the
+            `call` method of the layer at the call that created the node.
+    """
+    input_tensors = nest.flatten(input_tensors)
+    output_tensors = nest.flatten(output_tensors)
+
+    # Collect input tensor(s) coordinates.
+    inbound_layers = []
+    node_indices = []
+    tensor_indices = []
+    for x in input_tensors:
+      assert hasattr(x, '_keras_history')
+      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      inbound_layers.append(inbound_layer)
+      node_indices.append(node_index)
+      tensor_indices.append(tensor_index)
+
+    # Create node, add it to inbound nodes.
+    Node(
+        self,
+        inbound_layers=inbound_layers,
+        node_indices=node_indices,
+        tensor_indices=tensor_indices,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        arguments=arguments)
+
+    # Update tensor history metadata.
+    for i in range(len(output_tensors)):
+      # The metadata attribute consists of 1) a layer instance
+      # 2) a node index for the layer, 3) a tensor index for the node.
+      # The allows layer reuse (multiple nodes per layer) and multi-output
+      # or multi-input layers (e.g. a layer can return multiple tensors,
+      # and each can be sent to a different layer).
+      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
+
+  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
+    """Private utility to retrieves an attribute (e.g. inputs) from a node.
+
+    This is used to implement the methods:
+        - get_input_shape_at
+        - get_output_shape_at
+        - get_input_at
+        etc...
+
+    Arguments:
+        node_index: Integer index of the node from which
+            to retrieve the attribute.
+        attr: Exact node attribute name.
+        attr_name: Human-readable attribute name, for error messages.
+
+    Returns:
+        The layer's attribute `attr` at the node of index `node_index`.
+
+    Raises:
+        RuntimeError: If the layer has no inbound nodes, or if called in Eager
+        mode.
+        ValueError: If the index provided does not match any node.
+    """
+    if not self._inbound_nodes:
+      raise RuntimeError('The layer has never been called '
+                         'and thus has no defined ' + attr_name + '.')
+    if not len(self._inbound_nodes) > node_index:
+      raise ValueError('Asked to get ' + attr_name + ' at node ' +
+                       str(node_index) + ', but the layer has only ' +
+                       str(len(self._inbound_nodes)) + ' inbound nodes.')
+    values = getattr(self._inbound_nodes[node_index], attr)
+    if len(values) == 1:
+      return values[0]
+    else:
+      return values
+
   def get_input_mask_at(self, node_index):
     """Retrieves the input mask tensor(s) of a layer at a given node.
 
@@ -476,6 +1076,325 @@ class Layer(tf_base_layers.Layer):
     else:
       return getattr(output, '_keras_mask', None)
 
+  def get_input_shape_at(self, node_index):
+    """Retrieves the input shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple inputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_shapes',
+                                             'input shape')
+
+  def get_output_shape_at(self, node_index):
+    """Retrieves the output shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple outputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_shapes',
+                                             'output shape')
+
+  def get_input_at(self, node_index):
+    """Retrieves the input tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple inputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_tensors',
+                                             'input')
+
+  def get_output_at(self, node_index):
+    """Retrieves the output tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple outputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_tensors',
+                                             'output')
+
+  @property
+  def input(self):
+    """Retrieves the input tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one input,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Input tensor or list of input tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+      AttributeError: If no inbound nodes are found.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('Layer ' + self.name +
+                           ' is not connected, no input to return.')
+    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
+
+  @property
+  def output(self):
+    """Retrieves the output tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one output,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+      Output tensor or list of output tensors.
+
+    Raises:
+      AttributeError: if the layer is connected to more than one incoming
+        layers.
+      RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
+    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
+
+  @property
+  def input_shape(self):
+    """Retrieves the input shape(s) of a layer.
+
+    Only applicable if the layer has exactly one input,
+    i.e. if it is connected to one incoming layer, or if all inputs
+    have the same shape.
+
+    Returns:
+        Input shape, as an integer shape tuple
+        (or list of shape tuples, one tuple per input tensor).
+
+    Raises:
+        AttributeError: if the layer has no defined input_shape.
+        RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined input shape.')
+    all_input_shapes = set(
+        [str(node.input_shapes) for node in self._inbound_nodes])
+    if len(all_input_shapes) == 1:
+      input_shapes = self._inbound_nodes[0].input_shapes
+      if len(input_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in input_shapes
+        ]
+    else:
+      raise AttributeError('The layer "' + str(self.name) +
+                           ' has multiple inbound nodes, '
+                           'with different input shapes. Hence '
+                           'the notion of "input shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_input_shape_at(node_index)` '
+                           'instead.')
+
+  def count_params(self):
+    """Count the total number of scalars composing the weights.
+
+    Returns:
+        An integer count.
+
+    Raises:
+        ValueError: if the layer isn't yet built
+          (in which case its weights aren't yet defined).
+    """
+    if not self.built:
+      if self.__class__.__name__ == 'Sequential':
+        self.build()  # pylint: disable=no-value-for-parameter
+      else:
+        raise ValueError('You tried to call `count_params` on ' + self.name +
+                         ', but the layer isn\'t built. '
+                         'You can build it manually via: `' + self.name +
+                         '.build(batch_input_shape)`.')
+    weight_shapes = [w.get_shape().as_list() for w in self.weights]
+    return int(sum([np.prod(w) for w in weight_shapes]))
+
+  @property
+  def output_shape(self):
+    """Retrieves the output shape(s) of a layer.
+
+    Only applicable if the layer has one output,
+    or if all outputs have the same shape.
+
+    Returns:
+        Output shape, as an integer shape tuple
+        (or list of shape tuples, one tuple per output tensor).
+
+    Raises:
+        AttributeError: if the layer has no defined output shape.
+        RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined output shape.')
+    all_output_shapes = set(
+        [str(node.output_shapes) for node in self._inbound_nodes])
+    if len(all_output_shapes) == 1:
+      output_shapes = self._inbound_nodes[0].output_shapes
+      if len(output_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in output_shapes
+        ]
+    else:
+      raise AttributeError('The layer "%s"'
+                           ' has multiple inbound nodes, '
+                           'with different output shapes. Hence '
+                           'the notion of "output shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_output_shape_at(node_index)` '
+                           'instead.' % self.name)
+
+  @property
+  def inbound_nodes(self):
+    """Deprecated, do NOT use! Only for compatibility with external Keras."""
+    return self._inbound_nodes
+
+  @property
+  def outbound_nodes(self):
+    """Deprecated, do NOT use! Only for compatibility with external Keras."""
+    return self._outbound_nodes
+
+  def _assert_input_compatibility(self, inputs):
+    """Checks compatibility between the layer and provided inputs.
+
+    This checks that the tensor(s) `inputs` verify the input assumptions
+    of the layer (if any). If not, a clear and actional exception gets raised.
+
+    Arguments:
+        inputs: input tensor or list of input tensors.
+
+    Raises:
+        ValueError: in case of mismatch between
+            the provided inputs and the expectations of the layer.
+    """
+    if not self.input_spec:
+      return
+    if not isinstance(self.input_spec, (list, tuple)):
+      input_spec = nest.flatten(self.input_spec)
+    else:
+      input_spec = self.input_spec
+    inputs = nest.flatten(inputs)
+    if len(inputs) != len(input_spec):
+      raise ValueError('Layer ' + self.name + ' expects ' +
+                       str(len(input_spec)) + ' inputs, '
+                       'but it received ' + str(len(inputs)) +
+                       ' input tensors. Inputs received: ' + str(inputs))
+    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+      if spec is None:
+        continue
+
+      if (spec.ndim is not None or
+          spec.min_ndim is not None or
+          spec.max_ndim is not None):
+        if x.get_shape().ndims is None:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'its rank is undefined, but the layer requires a '
+                           'defined rank.')
+
+      # Check ndim.
+      if spec.ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim != spec.ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
+                           str(ndim) + '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      if spec.max_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim > spec.max_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected max_ndim=' + str(spec.max_ndim) +
+                           ', found ndim=' + str(ndim))
+      if spec.min_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim < spec.min_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           ': expected min_ndim=' + str(spec.min_ndim) +
+                           ', found ndim=' + str(ndim) +
+                           '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      # Check dtype.
+      if spec.dtype is not None:
+        if x.dtype != spec.dtype:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected dtype=' + str(spec.dtype) +
+                           ', found dtype=' + str(x.dtype))
+      # Check specific shape axes.
+      if spec.axes:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for axis, value in spec.axes.items():
+            if hasattr(value, 'value'):
+              value = value.value
+            if value is not None and shape[int(axis)] not in {value, None}:
+              raise ValueError(
+                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
+                  ' incompatible with the layer: expected axis ' + str(axis) +
+                  ' of input shape to have value ' + str(value) +
+                  ' but received input with shape ' + str(shape))
+      # Check shape.
+      if spec.shape is not None:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for spec_dim, dim in zip(spec.shape, shape):
+            if spec_dim is not None and dim is not None:
+              if spec_dim != dim:
+                raise ValueError('Input ' + str(input_index) +
+                                 ' is incompatible with layer ' + self.name +
+                                 ': expected shape=' + str(spec.shape) +
+                                 ', found shape=' + str(shape))
+
   def set_weights(self, weights):
     """Sets the weights of the layer, from Numpy arrays.
 
@@ -500,14 +1419,14 @@ class Layer(tf_base_layers.Layer):
     if not params:
       return
     weight_value_tuples = []
-    param_values = K.batch_get_value(params)
+    param_values = backend.batch_get_value(params)
     for pv, p, w in zip(param_values, params, weights):
       if pv.shape != w.shape:
         raise ValueError('Layer weight shape ' + str(pv.shape) +
                          ' not compatible with '
                          'provided weight shape ' + str(w.shape))
       weight_value_tuples.append((p, w))
-    K.batch_set_value(weight_value_tuples)
+    backend.batch_set_value(weight_value_tuples)
 
   def get_weights(self):
     """Returns the current weights of the layer.
@@ -516,7 +1435,7 @@ class Layer(tf_base_layers.Layer):
         Weights values as a list of numpy arrays.
     """
     params = self.weights
-    return K.batch_get_value(params)
+    return backend.batch_get_value(params)
 
   def get_config(self):
     """Returns the config of the layer.
@@ -558,9 +1477,196 @@ class Layer(tf_base_layers.Layer):
     """
     return cls(**config)
 
-  @tf_base_layers.Layer.activity_regularizer.setter
-  def activity_regularizer(self, activity_regularizer):
-    self._activity_regularizer = activity_regularizer
+
+@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected DataType of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
+
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
+
+class Node(object):
+  """A `Node` describes the connectivity between two layers.
+
+  Each time a layer is connected to some new input,
+  a node is added to `layer._inbound_nodes`.
+  Each time the output of a layer is used by another layer,
+  a node is added to `layer._outbound_nodes`.
+
+  Arguments:
+      outbound_layer: the layer that takes
+          `input_tensors` and turns them into `output_tensors`
+          (the node gets created when the `call`
+          method of the layer was called).
+      inbound_layers: a list of layers, the same length as `input_tensors`,
+          the layers from where `input_tensors` originate.
+      node_indices: a list of integers, the same length as `inbound_layers`.
+          `node_indices[i]` is the origin node of `input_tensors[i]`
+          (necessary since each inbound layer might have several nodes,
+          e.g. if the layer is being shared with a different data stream).
+      tensor_indices: a list of integers,
+          the same length as `inbound_layers`.
+          `tensor_indices[i]` is the index of `input_tensors[i]` within the
+          output of the inbound layer
+          (necessary since each inbound layer might
+          have multiple tensor outputs, with each one being
+          independently manipulable).
+      input_tensors: list of input tensors.
+      output_tensors: list of output tensors.
+      arguments: dictionary of keyword arguments that were passed to the
+          `call` method of the layer at the call that created the node.
+
+  `node_indices` and `tensor_indices` are basically fine-grained coordinates
+  describing the origin of the `input_tensors`.
+
+  A node from layer A to layer B is added to:
+    - A._outbound_nodes
+    - B._inbound_nodes
+  """
+
+  def __init__(self,
+               outbound_layer,
+               inbound_layers,
+               node_indices,
+               tensor_indices,
+               input_tensors,
+               output_tensors,
+               arguments=None):
+    # Layer instance (NOT a list).
+    if isinstance(outbound_layer, list):
+      raise ValueError(
+          '`outbound_layer` should be a layer instance, not a list.')
+    # this is the layer that takes a list of input tensors
+    # and turns them into a list of output tensors.
+    # the current node will be added to
+    # the inbound_nodes of outbound_layer.
+    self.outbound_layer = outbound_layer
+
+    # The following 3 properties describe where
+    # the input tensors come from: which layers,
+    # and for each layer, which node and which
+    # tensor output of each node.
+
+    # List of layer instances.
+    self.inbound_layers = inbound_layers
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.node_indices = node_indices
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.tensor_indices = tensor_indices
+
+    # Following 2 properties:
+    # tensor inputs and outputs of outbound_layer.
+
+    # List of tensors. 1:1 mapping with inbound_layers.
+    self.input_tensors = input_tensors
+    # List of tensors, created by outbound_layer.call().
+    self.output_tensors = output_tensors
+
+    # Following 2 properties: input and output shapes.
+
+    # List of shape tuples, shapes of input_tensors.
+    self.input_shapes = [static_shape(x) for x in input_tensors]
+    # List of shape tuples, shapes of output_tensors.
+    self.output_shapes = [static_shape(x) for x in output_tensors]
+
+    # Optional keyword arguments to layer's `call`.
+    self.arguments = arguments
+
+    # Add nodes to all layers involved.
+    for layer in inbound_layers:
+      if layer is not None:
+        # For compatibility with external Keras, we use the deprecated
+        # accessor here.
+        layer.outbound_nodes.append(self)
+    # For compatibility with external Keras, we use the deprecated
+    # accessor here.
+    outbound_layer.inbound_nodes.append(self)
+
+  def get_config(self):
+    inbound_names = []
+    for layer in self.inbound_layers:
+      if layer:
+        inbound_names.append(layer.name)
+      else:
+        inbound_names.append(None)
+    return {
+        'outbound_layer': self.outbound_layer.name,
+        'inbound_layers': inbound_names,
+        'node_indices': self.node_indices,
+        'tensor_indices': self.tensor_indices
+    }
+
+
+class DeferredTensor(object):
+  """Tensor-like object used to build graphs of layers in Eager mode.
+
+  When calling a layer on a DeferredTensor, the layer will not perform any
+  computation and will simply perfom shape inference to return new
+  DeferredTensors with appropriate shape information. Thus DeferredTensor
+  behaves like a graph-mode Tensor when manipulated by layers.
+  """
+
+  def __init__(self, shape, dtype, name=None):
+    self.shape = tensor_shape.TensorShape(shape)
+    if dtype is None:
+      self.dtype = dtypes.as_dtype(np.float32)
+    else:
+      self.dtype = dtypes.as_dtype(dtype)
+    self.name = name
+
+  def get_shape(self):
+    return self.shape
+
+  def __str__(self):
+    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
+                                                         self.get_shape(),
+                                                         self.dtype.name)
+
+  def __repr__(self):
+    return "<DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
+                                                        self.get_shape(),
+                                                        self.dtype.name)
 
 
 def shape_type_conversion(fn):
@@ -589,3 +1695,251 @@ def shape_type_conversion(fn):
       return tensor_shape.TensorShape(output_shape)
 
   return wrapper
+
+
+def object_list_uid(object_list):
+  """Creates a single string from object ids."""
+  object_list = nest.flatten(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def static_shape(x):
+  """Get the static shape of a Tensor, or None if it is unavailable."""
+  if x is None:
+    return None
+  try:
+    return tuple(x.get_shape().as_list())
+  except ValueError:
+    return None
+
+
+def get_reachable_from_inputs(inputs, targets=None):
+  """Returns the set of tensors/ops reachable from `inputs`.
+
+  Stops if all targets have been found (target is optional).
+
+  Only valid in Symbolic mode, not Eager mode.
+
+  Args:
+    inputs: List of tensors.
+    targets: List of tensors.
+
+  Returns:
+    A set of tensors reachable from the inputs (includes the inputs themselves).
+  """
+  reachable = set(inputs)
+  if targets:
+    targets = set(targets)
+  queue = inputs[:]
+
+  while queue:
+    x = queue.pop()
+    if isinstance(x, ops.Operation):
+      outputs = x.outputs[:] or []
+      outputs += x._control_outputs
+    elif isinstance(x, ops.Tensor):
+      outputs = x.consumers()
+    elif isinstance(x, tf_variables.Variable):
+      outputs = [x.op]
+    else:
+      raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x))
+
+    for y in outputs:
+      if y not in reachable:
+        reachable.add(y)
+        queue.insert(0, y)
+
+    if targets and targets.issubset(reachable):
+      return reachable
+  return reachable
+
+
+def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
+                      zero_based=False):
+  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
+
+  Arguments:
+    name: String name to make unique.
+    name_uid_map: An optional defaultdict(int) to use when creating unique
+      names. If None (default), uses a per-Graph dictionary.
+    avoid_names: An optional set or dict with names which should not be used. If
+      None (default) does not avoid any names.
+    namespace: Gets a name which is unique within the (graph, namespace). Layers
+      which are not Networks use a blank namespace and so get graph-global
+      names.
+    zero_based: If True, name sequences start with no suffix (e.g. "dense",
+      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
+
+  Returns:
+    Unique string name.
+
+  Example:
+
+  ```python
+  _unique_layer_name('dense')  # dense_1
+  _unique_layer_name('dense')  # dense_2
+  ```
+  """
+  if name_uid_map is None:
+    name_uid_map = get_default_graph_uid_map()
+  if avoid_names is None:
+    avoid_names = set()
+  proposed_name = None
+  while proposed_name is None or proposed_name in avoid_names:
+    name_key = (namespace, name)
+    if zero_based:
+      number = name_uid_map[name_key]
+      if number:
+        proposed_name = name + '_' + str(number)
+      else:
+        proposed_name = name
+      name_uid_map[name_key] += 1
+    else:
+      name_uid_map[name_key] += 1
+      proposed_name = name + '_' + str(name_uid_map[name_key])
+  return proposed_name
+
+
+def to_snake_case(name):
+  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
+  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
+  # If the class is private the name starts with "_" which is not secure
+  # for creating scopes. We prefix the name with "private" in this case.
+  if insecure[0] != '_':
+    return insecure
+  return 'private' + insecure
+
+
+def is_all_none(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  # We cannot use Python's `any` because the iterable may return Tensors.
+  for element in iterable:
+    if element is not None:
+      return False
+  return True
+
+
+def have_all_keras_metadata(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  return all([hasattr(x, '_keras_history') for x in iterable])
+
+
+def collect_previous_mask(input_tensors):
+  """Retrieves the output mask(s) of the previous node.
+
+  Arguments:
+      input_tensors: A tensor or list of tensors.
+
+  Returns:
+      A mask tensor or list of mask tensors.
+  """
+  input_tensors = nest.flatten(input_tensors)
+  masks = []
+  for x in input_tensors:
+    if hasattr(x, '_keras_mask'):
+      mask = x._keras_mask  # pylint: disable=protected-access
+      masks.append(mask)
+    else:
+      masks.append(None)
+  if len(masks) == 1:
+    return masks[0]
+  return masks
+
+
+def is_tensor_or_tensor_list(v):
+  v = nest.flatten(v)
+  if v and isinstance(v[0], ops.Tensor):
+    return True
+  else:
+    return False
+
+
+def get_default_graph_uid_map():
+  # TODO(fchollet): refactor this into backend.
+  graph = ops.get_default_graph()
+  name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
+  if name_uid_map is None:
+    name_uid_map = collections.defaultdict(int)
+    backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
+  return name_uid_map
+
+
+def make_variable(name,
+                  shape=None,
+                  dtype=dtypes.float32,
+                  initializer=None,
+                  partition_info=None,
+                  trainable=True,
+                  caching_device=None,
+                  validate_shape=True,
+                  constraint=None,
+                  use_resource=None,
+                  partitioner=None):  # pylint: disable=unused-argument
+  """Temporary util to create a variable (relies on `variable_scope.variable`).
+
+  Some reuse-related technicalities prevent us from using
+  `variable_scope.get_variable()` directly, so we use a subcomponent
+  that has fewer constraints (`variable_scope.variable()`).
+
+  In the longer term, it seems like a similar "default variable creator" method
+  should exist in `CheckpointableBase` instead. When this happens, we can get
+  rid of this temporary solution.
+
+  TODO(fchollet): remove this method when no longer needed.
+  TODO(fchollet): handle `partitioner` argument.
+
+  Arguments:
+    name: Variable name.
+    shape: Variable shape.
+    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+    initializer: Initializer instance (callable).
+    partition_info: Not handled at this time.
+    trainable: Whether the variable should be part of the layer's
+      "trainable_variables" (e.g. variables, biases)
+      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+      Note, if the current variable scope is marked as non-trainable
+      then this parameter is ignored and any added variables are also
+      marked as non-trainable.
+    caching_device: Passed to `vs.variable`.
+    validate_shape: Passed to `vs.variable`.
+    constraint: Constraint instance (callable).
+    use_resource: Whether to use a `ResourceVariable`.
+    partitioner: Not handled at this time.
+
+  Returns:
+    Variable instance.
+  """
+  initializing_from_value = False
+  if initializer is not None and not callable(initializer):
+    initializing_from_value = True
+
+  with ops.init_scope():
+    if initializing_from_value:
+      init_val = initializer
+      variable_dtype = None
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
+          shape, dtype=dtype, partition_info=partition_info)
+      variable_dtype = dtype.base_dtype
+  if use_resource is None:
+    use_resource = True
+
+  v = vs.variable(
+      initial_value=init_val,
+      name=name,
+      trainable=trainable,
+      caching_device=caching_device,
+      dtype=variable_dtype,
+      validate_shape=validate_shape,
+      constraint=constraint,
+      use_resource=use_resource)
+  return v
diff --git a/tensorflow/python/keras/_impl/keras/engine/input_layer.py b/tensorflow/python/keras/_impl/keras/engine/input_layer.py
index b51dd8a2189d0c8542c84dfeac9be0d72b96ff1b..bd9dcbe3c576851123dfcabe3e36379019627ac5 100644
--- a/tensorflow/python/keras/_impl/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/input_layer.py
@@ -23,7 +23,6 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import base_layer
-from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -95,7 +94,7 @@ class InputLayer(base_layer.Layer):
 
       if context.executing_eagerly():
         # In eager mode, create a temporary placeholder to call the layer on.
-        input_tensor = tf_base_layers._DeferredTensor(  # pylint: disable=protected-access
+        input_tensor = base_layer.DeferredTensor(  # pylint: disable=protected-access
             shape=batch_input_shape,
             dtype=dtype,
             name=self.name)
@@ -123,7 +122,7 @@ class InputLayer(base_layer.Layer):
     # Create an input node to add to self.outbound_node
     # and set output_tensors' _keras_history.
     input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
-    tf_base_layers.Node(
+    base_layer.Node(
         self,
         inbound_layers=[],
         node_indices=[],
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
index 9f1c7de1157a3659ccb27e4850e99e09016d0067..cc177c14a894040df37f75bbdc6b2651336fe869 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -35,8 +35,6 @@ from tensorflow.python.keras._impl.keras.engine import saving
 from tensorflow.python.keras._impl.keras.utils import generic_utils
 from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
-from tensorflow.python.layers import base as tf_base_layers
-from tensorflow.python.layers import utils as tf_layers_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpointable
 from tensorflow.python.util import nest
@@ -82,7 +80,7 @@ class Network(base_layer.Layer):
     # self.losses
     # self.updates
 
-    self._init_set_name(name)
+    self._init_set_name(name, zero_based=True)
     self._activity_regularizer = None
     # This acts just like the `trainable` attribute of any layer instance.
     # It does not affect users of the underlying layers, only users of the
@@ -132,14 +130,14 @@ class Network(base_layer.Layer):
     if context.executing_eagerly():
       # Check that all inputs/outputs are DeferredTensors.
       for tensor in self.inputs:
-        if not isinstance(tensor, tf_base_layers._DeferredTensor):  # pylint: disable=protected-access
+        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
           raise TypeError('When eager execution is enabled, '
                           'inputs must come from a call to '
                           '`tf.keras.Input` (called after '
                           'tfe.enable_eager_execution()). '
                           'Received invalid input: ' + str(tensor))
       for tensor in self.outputs:
-        if not isinstance(tensor, tf_base_layers._DeferredTensor):  # pylint: disable=protected-access
+        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
           raise TypeError('When eager execution is enabled, '
                           'outputs must come from a call to '
                           'a layer (called after '
@@ -230,7 +228,7 @@ class Network(base_layer.Layer):
     self._layers_by_depth = layers_by_depth
 
     # Create the node linking internal inputs to internal outputs.
-    tf_base_layers.Node(
+    base_layer.Node(
         outbound_layer=self,
         inbound_layers=[],
         node_indices=[],
@@ -243,8 +241,8 @@ class Network(base_layer.Layer):
     for x in self.inputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
       masks.append(mask)
-    mask_cache_key = (tf_layers_util.object_list_uid(self.inputs) + '_' +
-                      tf_layers_util.object_list_uid(masks))
+    mask_cache_key = (base_layer.object_list_uid(self.inputs) + '_' +
+                      base_layer.object_list_uid(masks))
     masks = []
     for x in self.outputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
@@ -289,7 +287,7 @@ class Network(base_layer.Layer):
     self.built = False
 
   def __setattr__(self, name, value):
-    if isinstance(value, (tf_base_layers.Layer, Network)):
+    if isinstance(value, (base_layer.Layer, Network)):
       try:
         is_graph_network = self._is_graph_network
       except AttributeError:
@@ -299,6 +297,10 @@ class Network(base_layer.Layer):
       if not is_graph_network:
         if value not in self._layers:
           self._layers.append(value)
+          if hasattr(value, '_use_resource_variables'):
+            # In subclassed models, legacy layers (tf.layers) must always use
+            # resource variables.
+            value._use_resource_variables = True
     if isinstance(value, checkpointable.CheckpointableBase):
       # Layer (and therefore Network/Model) inherit from CheckpointableBase
       # rather than Checkpointable, which means there is no Checkpointable
@@ -387,8 +389,8 @@ class Network(base_layer.Layer):
       masks = [None for _ in range(len(inputs))]
     else:
       masks = generic_utils.to_list(mask)
-    cache_key = (tf_layers_util.object_list_uid(inputs)
-                 + '_' + tf_layers_util.object_list_uid(masks))
+    cache_key = (base_layer.object_list_uid(inputs)
+                 + '_' + base_layer.object_list_uid(masks))
     if cache_key in self._output_mask_cache:
       return self._output_mask_cache[cache_key]
     else:
@@ -502,8 +504,7 @@ class Network(base_layer.Layer):
         relevant_inputs += inputs
       else:
         relevant_inputs.append(inputs)
-    reachable = tf_layers_util.get_reachable_from_inputs(relevant_inputs,
-                                                         updates)
+    reachable = base_layer.get_reachable_from_inputs(relevant_inputs, updates)
     relevant_conditional_updates = [x for x in updates if x in reachable]
     unconditional_updates = [
         x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
@@ -540,8 +541,7 @@ class Network(base_layer.Layer):
         relevant_inputs += inputs
       else:
         relevant_inputs.append(inputs)
-    reachable = tf_layers_util.get_reachable_from_inputs(relevant_inputs,
-                                                         losses)
+    reachable = base_layer.get_reachable_from_inputs(relevant_inputs, losses)
     relevant_conditional_losses = [x for x in losses if x in reachable]
     unconditional_losses = [
         x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
@@ -623,8 +623,8 @@ class Network(base_layer.Layer):
     if not context.executing_eagerly():
       # Try to retrieve cached outputs if the layer has already been called
       # on these exact inputs.
-      cache_key = (tf_layers_util.object_list_uid(inputs)
-                   + '_' + tf_layers_util.object_list_uid(masks))
+      cache_key = (base_layer.object_list_uid(inputs)
+                   + '_' + base_layer.object_list_uid(masks))
       if cache_key in self._output_tensor_cache:
         # Cache hit.
         return self._output_tensor_cache[cache_key]
@@ -656,7 +656,7 @@ class Network(base_layer.Layer):
                        ': model has ' + str(len(self._input_layers)) +
                        ' tensor inputs.')
 
-    cache_key = tf_layers_util.object_list_uid(input_shapes)
+    cache_key = base_layer.object_list_uid(input_shapes)
     if cache_key not in self._output_shape_cache:
       # Cache miss. We have to run the network graph manually (recursive calls
       # to `compute_output_shape`).
@@ -845,7 +845,7 @@ class Network(base_layer.Layer):
     for x in self.outputs:
       assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
       tensor, mask = tensor_map[str(id(x))]
-      output_shapes.append(tf_layers_util.static_shape(x))
+      output_shapes.append(base_layer.static_shape(x))
       output_tensors.append(tensor)
       output_masks.append(mask)
 
@@ -859,14 +859,14 @@ class Network(base_layer.Layer):
     if not context.executing_eagerly():
       # Update cache;
       # keys are based on ids on input tensors and inputs masks.
-      cache_key = (tf_layers_util.object_list_uid(inputs)
-                   + '_' + tf_layers_util.object_list_uid(masks))
+      cache_key = (base_layer.object_list_uid(inputs)
+                   + '_' + base_layer.object_list_uid(masks))
       self._output_tensor_cache[cache_key] = output_tensors
       self._output_mask_cache[cache_key] = output_masks
 
       if output_shapes is not None:
-        input_shapes = [tf_layers_util.static_shape(x) for x in inputs]
-        cache_key = tf_layers_util.object_list_uid(input_shapes)
+        input_shapes = [base_layer.static_shape(x) for x in inputs]
+        cache_key = base_layer.object_list_uid(input_shapes)
         self._output_shape_cache[cache_key] = output_shapes
 
     return output_tensors, output_masks
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
index dde090120456f968267e1c572f22eda1bd6ed7c4..3b1578cddfd97b31cae8619cdd2d8e1997585f51 100644
--- a/tensorflow/python/keras/_impl/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
@@ -422,7 +422,7 @@ class TestWholeModelSaving(test.TestCase):
         f = keras.layers.Dense(2, name='nested_model_dense_%d' % (i,))(f)
       # This layer name will make the `weights_name`
       # HDF5 attribute blow out of proportion.
-      f = keras.layers.Dense(2, name='nested_model_output' + ('x' * (2**15)))(f)
+      f = keras.layers.Dense(2, name='nested_model_output' + ('x' * (2**14)))(f)
       nested_model = keras.Model(inputs=[x], outputs=[f], name='nested_model')
 
       x = keras.Input(shape=(2,), name='outer_model_input')
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential.py b/tensorflow/python/keras/_impl/keras/engine/sequential.py
index 2ef99d5ab3f432058fdf685b99b01aa0b5eeffdc..bd13ca671340551c3e96895951be360b15e55cfe 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential.py
@@ -123,7 +123,7 @@ class Sequential(Model):
             multiple output tensors, or is already connected
             somewhere else (forbidden in `Sequential` models).
     """
-    if not isinstance(layer, (base_layer.Layer, base_layer.TFBaseLayer)):
+    if not isinstance(layer, base_layer.Layer):
       raise TypeError('The added layer must be '
                       'an instance of class Layer. '
                       'Found: ' + str(layer))
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
index c9a47581df03e0fc1ad38552ba8634862435cd80..8aba16aef3e187e9e33bdb65c7d44b0e622730ef 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
@@ -151,6 +151,7 @@ class TestSequential(test.TestCase):
     with self.test_session():
       model = keras.models.Sequential()
       model.add(keras.layers.BatchNormalization(input_shape=(4,)))
+      assert model.updates
 
       model.trainable = False
       assert not model.updates
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index 9ab4b6fdcf55cc6186b96dd4e747f3600a4f78f8..49cc1cd3b38325b4f42d5b26bac9442d7cc09b05 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -25,7 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
-from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -52,11 +52,13 @@ class TopologyConstructionTest(test.TestCase):
                                    (1, 1),
                                    'float32',
                                    trainable=False)
-        self.add_update(state_ops.assign_add(self.a, [[1.]]))
+        self.add_update(state_ops.assign_add(self.a, [[1.]],
+                                             name='unconditional_update'))
         self.built = True
 
       def call(self, inputs):
-        self.add_update(state_ops.assign_add(self.a, inputs),
+        self.add_update(state_ops.assign_add(self.b, inputs,
+                                             name='conditional_update'),
                         inputs=True)
         return inputs + 1
 
@@ -97,10 +99,20 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.updates), 4)
     self.assertEqual(len(network.get_updates_for(None)), 2)
 
-    network.add_update(state_ops.assign_add(layer.a, x4), inputs=True)
+    network.add_update(state_ops.assign_add(layer.b, x4), inputs=True)
     self.assertEqual(len(network.updates), 5)
     self.assertEqual(len(network.get_updates_for(x4)), 2)
 
+  def test_get_updates_bn(self):
+    x1 = keras.Input(shape=(1,))
+    layer = keras.layers.BatchNormalization()
+    _ = layer.apply(x1)
+
+    print('BN updates', layer._updates)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(x1)), 2)
+    self.assertEqual(len(layer.get_updates_for(None)), 0)
+
   def test_get_losses(self):
 
     class MyLayer(keras.layers.Layer):
@@ -875,25 +887,25 @@ class TopologyConstructionTest(test.TestCase):
 class DeferredModeTest(test.TestCase):
 
   def testDeferredTensorAttributes(self):
-    x = tf_base_layers._DeferredTensor(shape=(None, 2),
-                                       dtype='float32',
-                                       name='x')
+    x = base_layer.DeferredTensor(shape=(None, 2),
+                                  dtype='float32',
+                                  name='x')
     self.assertEqual(str(x),
                      'DeferredTensor(\'x\', shape=(?, 2), dtype=float32)')
     self.assertEqual(repr(x),
-                     '<_DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
+                     '<DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
 
   @test_util.run_in_graph_and_eager_modes()
   def testSimpleNetworkBuilding(self):
     inputs = keras.engine.Input(shape=(32,))
     if context.executing_eagerly():
-      self.assertIsInstance(inputs, tf_base_layers._DeferredTensor)
+      self.assertIsInstance(inputs, base_layer.DeferredTensor)
       self.assertEqual(inputs.dtype.name, 'float32')
       self.assertEqual(inputs.shape.as_list(), [None, 32])
 
     x = keras.layers.Dense(2)(inputs)
     if context.executing_eagerly():
-      self.assertIsInstance(x, tf_base_layers._DeferredTensor)
+      self.assertIsInstance(x, base_layer.DeferredTensor)
       self.assertEqual(x.dtype.name, 'float32')
       self.assertEqual(x.shape.as_list(), [None, 2])
 
@@ -936,5 +948,34 @@ class DeferredModeTest(test.TestCase):
       self.assertEqual(outputs[0].shape.as_list(), [10, 16])
       self.assertEqual(outputs[1].shape.as_list(), [10, 2])
 
+
+class GraphUtilsTest(test.TestCase):
+
+  def testGetReachableFromInputs(self):
+
+    with self.test_session():
+      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
+      x_1 = pl_1 + pl_2
+      x_2 = pl_2 * 2
+      x_3 = pl_3 + 1
+      x_4 = x_1 + x_2
+      x_5 = x_3 * pl_1
+
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_1]),
+          {pl_1, x_1, x_4, x_5, x_1.op, x_4.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_1, pl_2]),
+          {pl_1, pl_2, x_1, x_2, x_4, x_5, x_1.op, x_2.op, x_4.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_3]),
+          {pl_3, x_3, x_5, x_3.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([x_3]),
+          {x_3, x_5, x_5.op})
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 71de657da81b92a2fc6b1eef9041147be6ff307e..7c4674381458d758939cc34344d7e11bf5941c3c 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -31,10 +31,10 @@ from tensorflow.python.keras._impl.keras.engine import training_arrays
 from tensorflow.python.keras._impl.keras.engine import training_eager
 from tensorflow.python.keras._impl.keras.engine import training_generator
 from tensorflow.python.keras._impl.keras.engine import training_utils
+from tensorflow.python.keras._impl.keras.engine.base_layer import DeferredTensor
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.layers.base import _DeferredTensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
@@ -891,15 +891,6 @@ class Model(Network):
     else:
       self._symbolic_set_inputs(inputs, training=training)
 
-  def _set_scope(self, scope=None):
-    """Modify the Layer scope creation logic to create ResourceVariables."""
-    super(Model, self)._set_scope(scope=scope)
-    # Subclassed Models create ResourceVariables by default. This makes it
-    # easier to use Models in an eager/graph agnostic way (since eager execution
-    # always uses ResourceVariables).
-    if not self._is_graph_network:
-      self._scope.set_use_resource(True)
-
   def _eager_set_inputs(self, inputs):
     """Set model's input and output specs based on the input data received.
 
@@ -933,11 +924,11 @@ class Model(Network):
     else:
       dummy_output_values = [dummy_output_values]
     self.outputs = [
-        _DeferredTensor(shape=(None for _ in v.shape),
-                        dtype=v.dtype) for v in dummy_output_values]
+        DeferredTensor(shape=(None for _ in v.shape),
+                       dtype=v.dtype) for v in dummy_output_values]
     self.inputs = [
-        _DeferredTensor(shape=(None for _ in v.shape),
-                        dtype=v.dtype) for v in dummy_input_values]
+        DeferredTensor(shape=(None for _ in v.shape),
+                       dtype=v.dtype) for v in dummy_input_values]
     self.input_names = [
         'input_%d' % (i + 1) for i in range(len(dummy_input_values))]
     self.output_names = [
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 08fd26dd18d5bc1b171d780be133f02f51b9c248..6699fd5212fe5f5748215a6a5e25dc803d3f2fd4 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -23,10 +23,12 @@ import unittest
 
 import numpy as np
 
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 try:
@@ -1140,6 +1142,21 @@ class TestTrainingWithDataTensors(test.TestCase):
                 epochs=1, steps_per_epoch=2, verbose=0,
                 validation_data=(inputs, targets), validation_steps=2)
 
+      # Test with dynamic shape
+      inputs = array_ops.placeholder_with_default(
+          np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
+      targets = array_ops.placeholder_with_default(
+          np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
+      self.assertEqual(inputs.shape[0].value, None)
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+      model.predict(inputs, steps=2)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+      model.fit(inputs, targets,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=(inputs, targets), validation_steps=2)
+
   def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
     with self.test_session():
       a = keras.layers.Input(shape=(3,), name='input_a')
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index a3fc8ef2a0359c527a2757c1888d61822e35d7a9..48afe48e6c003152d31cda7ed1200b4c783cec8f 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -61,22 +61,21 @@ def check_num_samples(ins,
   Raises:
       ValueError: In case of invalid arguments.
   """
-  if steps is not None:
-    num_samples = None
-    if batch_size is not None:
-      raise ValueError(
-          'If ' + steps_name + ' is set, the `batch_size` must be None.')
-  if has_symbolic_tensors(ins) and steps is None:
-    raise ValueError('If your data is in the form of symbolic tensors, '
-                     'you should specify the `' + steps_name + '` argument '
-                     '(instead of the `batch_size` argument).')
-  if ins and hasattr(ins[0], 'shape'):
-    num_samples = int(ins[0].shape[0])
-  elif steps is None:
+  if steps is not None and batch_size is not None:
     raise ValueError(
-        'Either the input data should have '
-        'a defined shape, or ' + steps_name + ' should be specified.')
-  return num_samples
+        'If ' + steps_name + ' is set, the `batch_size` must be None.')
+
+  if not ins or has_symbolic_tensors(ins):
+    if steps is None:
+      raise ValueError('If your data is in the form of symbolic tensors, '
+                       'you should specify the `' + steps_name + '` argument '
+                       '(instead of the `batch_size` argument, '
+                       'because symbolic tensors are expected to produce '
+                       'batches of input data).')
+    return None
+  if hasattr(ins[0], 'shape'):
+    return int(ins[0].shape[0])
+  return None  # Edge case where ins == [static_learning_phase]
 
 
 def standardize_single_array(x):
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 5d370ebbb5f31d102c381e46bb8f696e151f492b..b922a6c68399e6c0c43b98c54ff0f550326c2199 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -37,6 +37,7 @@ from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
+from tensorflow.python.ops import variables as variables_module
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
@@ -54,6 +55,19 @@ def _cast_tensor_to_floatx(x):
     return math_ops.cast(x, K.floatx())
 
 
+def _any_variable_initalized():
+  """Check if any variable has been initialized in the Keras model.
+
+  Returns:
+    boolean, True if at least one variable has been initalized, else False.
+  """
+  variables = variables_module.global_variables()
+  for v in variables:
+    if getattr(v, '_keras_initialized', False):
+      return True
+  return False
+
+
 def _create_ordered_io(keras_model, estimator_io, is_input=True):
   """Create a list of tensors from IO dictionary based on Keras IO order.
 
@@ -395,7 +409,8 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
                                      custom_objects)
       # save to checkpoint
       with session.Session(config=estimator._session_config) as sess:
-        model.set_weights(keras_weights)
+        if keras_weights:
+          model.set_weights(keras_weights)
         # Make update ops and initialize all variables.
         if not model.train_function:
           # pylint: disable=protected-access
@@ -465,11 +480,22 @@ def model_to_estimator(keras_model=None,
   estimator = estimator_lib.Estimator(
       keras_model_fn, model_dir=model_dir, config=config)
 
-  # Pass the config into keras backend's default session.
-  sess = session.Session(config=estimator._session_config)
-  K.set_session(sess)
+  # Check if we need to call get_weights:
+  if _any_variable_initalized():
+    keras_weights = keras_model.get_weights()
+    # Warn if config passed to estimator tries to update GPUOptions. If a
+    # session has already been created, the GPUOptions passed to the first
+    # session sticks.
+    if estimator._session_config.HasField('gpu_options'):
+      logging.warning(
+          'The Keras backend session has already been set. '
+          'The _session_config passed to model_to_estimator will not be used.')
+  else:
+    # Pass the config into keras backend's default session.
+    sess = session.Session(config=estimator._session_config)
+    K.set_session(sess)
+    keras_weights = None
 
-  keras_weights = keras_model.get_weights()
   if keras_model._is_graph_network:
     # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
     _save_first_checkpoint(keras_model,
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index e076dc25b16900636313f0ddd85a61b8d917fc91..653cdc01e245666f2b8fc1e742bb8ef71512d279 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -27,10 +27,12 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
+from tensorflow.python.keras._impl.keras.optimizers import SGD
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -443,8 +445,9 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     model = simple_functional_model()
     model.compile(
         loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
-    est_keras = keras.estimator.model_to_estimator(
-        keras_model=model, config=self._config)
+    with self.test_session():
+      est_keras = keras.estimator.model_to_estimator(
+          keras_model=model, config=self._config)
 
     with self.test_session():
       with self.assertRaises(ValueError):
@@ -497,20 +500,42 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
             model_dir=tempfile.mkdtemp(dir=self._base_dir))
 
   def test_gpu_config(self):
+    with ops.Graph().as_default():
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['mse', keras.metrics.categorical_accuracy])
+
+      gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
+      sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
+      self._config._session_config = sess_config
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+      self.assertEqual(
+          keras.backend.get_session()
+          ._config.gpu_options.per_process_gpu_memory_fraction,
+          gpu_options.per_process_gpu_memory_fraction)
+
+  def test_pretrained_weights(self):
     keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
     keras_model.compile(
         loss='categorical_crossentropy',
-        optimizer='rmsprop',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
         metrics=['mse', keras.metrics.categorical_accuracy])
-
-    gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
-    sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
-    self._config._session_config = sess_config
-    keras.estimator.model_to_estimator(
-        keras_model=keras_model, config=self._config)
-    self.assertEqual(keras.backend.get_session()
-                     ._config.gpu_options.per_process_gpu_memory_fraction,
-                     gpu_options.per_process_gpu_memory_fraction)
+    with self.test_session():
+      keras_model.train_on_batch(
+          np.random.random((10,) + _INPUT_SIZE),
+          np.random.random((10, _NUM_CLASS)))
+      weights = keras_model.get_weights()
+      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+      keras_model.set_weights(weights)
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer=SGD(lr=0.0001, momentum=0.9),
+          metrics=['mse', keras.metrics.categorical_accuracy])
+      keras.estimator.model_to_estimator(
+          keras_model=keras_model, config=self._config)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/initializers.py b/tensorflow/python/keras/_impl/keras/initializers.py
index 300bed5e1437074d010760c427c14f68e58ac363..ecb71d00e2c78ced6095aaa3a0180b454b04917a 100644
--- a/tensorflow/python/keras/_impl/keras/initializers.py
+++ b/tensorflow/python/keras/_impl/keras/initializers.py
@@ -201,6 +201,8 @@ def deserialize(config, custom_objects=None):
 
 @tf_export('keras.initializers.get')
 def get(identifier):
+  if identifier is None:
+    return None
   if isinstance(identifier, dict):
     return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/_impl/keras/integration_test.py
index 280f7ed1b11e2026ac196eb319f7d5da8301f060..43aff67ef93c8ec495beafdd17c5557b6398671f 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/_impl/keras/integration_test.py
@@ -29,16 +29,15 @@ from tensorflow.python.platform import test
 
 class KerasIntegrationTest(test.TestCase):
 
-  def test_vector_classification_declarative(self):
+  def test_vector_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential([
           keras.layers.Dense(16,
@@ -48,23 +47,22 @@ class KerasIntegrationTest(test.TestCase):
           keras.layers.Dense(y_train.shape[-1], activation='softmax')
       ])
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_functional(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(10,),
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(20,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
       x = keras.layers.Dense(16, activation='relu')(inputs)
@@ -73,77 +71,78 @@ class KerasIntegrationTest(test.TestCase):
 
       model = keras.models.Model(inputs, outputs)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  def test_temporal_classification_declarative(self):
+  def test_temporal_classification_sequential(self):
     with self.test_session():
-      np.random.seed(1336)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(4, 8),
+      np.random.seed(1337)
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(4, 10),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
       model.add(keras.layers.LSTM(5, return_sequences=True,
                                   input_shape=x_train.shape[1:]))
       model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  def test_image_classification_declarative(self):
+  def test_image_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(8, 8, 3),
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(12, 12, 3),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
       model.add(keras.layers.Conv2D(
-          8, 3,
+          4, 3,
+          padding='same',
           activation='relu',
           input_shape=x_train.shape[1:]))
-      model.add(keras.layers.BatchNormalization())
       model.add(keras.layers.Conv2D(
           8, 3,
           padding='same',
           activation='relu'))
-      model.add(keras.layers.GlobalMaxPooling2D())
+      model.add(keras.layers.Conv2D(
+          16, 3,
+          padding='same',
+          activation='relu'))
+      model.add(keras.layers.Flatten())
       model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_video_classification_functional(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(4, 8, 8, 3),
           num_classes=3)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
       x = keras.layers.TimeDistributed(
@@ -159,22 +158,21 @@ class KerasIntegrationTest(test.TestCase):
                     optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.70)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_shared_sequential(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       base_model = keras.models.Sequential([
           keras.layers.Dense(16,
@@ -189,27 +187,26 @@ class KerasIntegrationTest(test.TestCase):
       y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
       model = keras.models.Model(x, y)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       self.assertEqual(len(model.losses), 2)
       self.assertEqual(len(model.updates), 2)
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.84)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_shared_model(self):
     # Test that functional models that feature internal updates
     # and internal losses can be shared.
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(x_train.shape[1:])
       x = keras.layers.Dense(16,
@@ -225,12 +222,12 @@ class KerasIntegrationTest(test.TestCase):
       y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
       model = keras.models.Model(x, y)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_embedding_with_clipnorm(self):
     with self.test_session():
@@ -242,9 +239,9 @@ class KerasIntegrationTest(test.TestCase):
   def test_using_tf_layers_in_keras_sequential_model(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
 
@@ -254,25 +251,23 @@ class KerasIntegrationTest(test.TestCase):
       model.summary()
 
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_using_tf_layers_in_keras_functional_model(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.Input(shape=(10,))
       x = tf_core_layers.Dense(32, activation=nn.relu)(inputs)
@@ -281,12 +276,12 @@ class KerasIntegrationTest(test.TestCase):
       model.summary()
 
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 7cdebc6aa4f8460d14bef3edb9edf7e4db5066c5..12b965587f5695fe410d2363956f54386c9fa8cf 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -19,9 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
@@ -38,12 +39,232 @@ from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D
 from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.layers import convolutional as tf_convolutional_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Conv(Layer):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
+  a bias vector is created and added to the outputs. Finally, if
+  `activation` is not `None`, it is applied to the outputs as well.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    kernel_constraint: Optional projection function to be applied to the
+        kernel after being updated by an `Optimizer` (e.g. used to implement
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(Conv, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=regularizers.get(activity_regularizer),
+        **kwargs)
+    self.rank = rank
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(
+        kernel_size, rank, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, rank, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(
+        dilation_rate, rank, 'dilation_rate')
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+    self.input_spec = InputSpec(ndim=self.rank + 2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    kernel_shape = self.kernel_size + (input_dim, self.filters)
+
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.input_spec = InputSpec(ndim=self.rank + 2,
+                                axes={channel_axis: input_dim})
+    self._convolution_op = nn_ops.Convolution(
+        input_shape,
+        filter_shape=self.kernel.get_shape(),
+        dilation_rate=self.dilation_rate,
+        strides=self.strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format,
+                                                   self.rank + 2))
+    self.built = True
+
+  def call(self, inputs):
+    outputs = self._convolution_op(inputs, self.kernel)
+
+    if self.use_bias:
+      if self.data_format == 'channels_first':
+        if self.rank == 1:
+          # nn.bias_add does not accept a 1D input tensor.
+          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
+          outputs += bias
+        if self.rank == 2:
+          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
+        if self.rank == 3:
+          # As of Mar 2017, direct addition is significantly slower than
+          # bias_add when computing gradients. To use bias_add, we collapse Z
+          # and Y into a single dimension to obtain a 4D input tensor.
+          outputs_shape = outputs.shape.as_list()
+          if outputs_shape[0] is None:
+            outputs_shape[0] = -1
+          outputs_4d = array_ops.reshape(outputs,
+                                         [outputs_shape[0], outputs_shape[1],
+                                          outputs_shape[2] * outputs_shape[3],
+                                          outputs_shape[4]])
+          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
+          outputs = array_ops.reshape(outputs_4d, outputs_shape)
+      else:
+        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @tf_export('keras.layers.Conv1D', 'keras.layers.Convolution1D')
-class Conv1D(tf_convolutional_layers.Conv1D, Layer):
+class Conv1D(Conv):
   """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -74,6 +295,8 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
           where the model should not violate the temporal order.
           See [WaveNet: A Generative Model for Raw Audio, section
             2.1](https://arxiv.org/abs/1609.03499).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
       dilation_rate: an integer or tuple/list of a single integer, specifying
           the dilation rate to use for dilated convolution.
           Currently, specifying any `dilation_rate` value != 1 is
@@ -105,6 +328,7 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
                kernel_size,
                strides=1,
                padding='valid',
+               data_format='channels_last',
                dilation_rate=1,
                activation=None,
                use_bias=True,
@@ -117,11 +341,12 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
                bias_constraint=None,
                **kwargs):
     super(Conv1D, self).__init__(
+        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
-        data_format='channels_last',
+        data_format=data_format,
         dilation_rate=dilation_rate,
         activation=activations.get(activation),
         use_bias=use_bias,
@@ -134,30 +359,9 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv2D', 'keras.layers.Convolution2D')
-class Conv2D(tf_convolutional_layers.Conv2D, Layer):
+class Conv2D(Conv):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -247,9 +451,8 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv2D, self).__init__(
+        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -267,31 +470,9 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv3D', 'keras.layers.Convolution3D')
-class Conv3D(tf_convolutional_layers.Conv3D, Layer):
+class Conv3D(Conv):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
@@ -388,9 +569,8 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv3D, self).__init__(
+        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -408,32 +588,10 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv2DTranspose',
            'keras.layers.Convolution2DTranspose')
-class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
+class Conv2DTranspose(Conv2D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -529,8 +687,6 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv2DTranspose, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
@@ -548,31 +704,124 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv2DTranspose, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if len(input_shape) != 4:
+      raise ValueError('Inputs should have rank 4. Received input shape: ' +
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_height = conv_utils.deconv_output_length(height,
+                                                 kernel_h,
+                                                 self.padding,
+                                                 stride_h)
+    out_width = conv_utils.deconv_output_length(width,
+                                                kernel_w,
+                                                self.padding,
+                                                stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_height, out_width)
+      strides = (1, 1, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_height, out_width, self.filters)
+      strides = (1, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv2d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if not context.executing_eagerly():
+      # Infer the static output shape:
+      out_shape = inputs.get_shape().as_list()
+      out_shape[c_axis] = self.filters
+      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
+                                                          kernel_h,
+                                                          self.padding,
+                                                          stride_h)
+      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
+                                                          kernel_w,
+                                                          self.padding,
+                                                          stride_w)
+      outputs.set_shape(out_shape)
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[h_axis] = conv_utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = conv_utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('keras.layers.Conv3DTranspose',
            'keras.layers.Convolution3DTranspose')
-class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
+class Conv3DTranspose(Conv3D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -679,8 +928,6 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv3DTranspose, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
@@ -698,6 +945,314 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if len(input_shape) != 5:
+      raise ValueError('Inputs should have rank 5, received input shape:',
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined, found None: ' + str(input_shape))
+    input_dim = int(input_shape[channel_axis])
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+    self.input_spec = InputSpec(ndim=5, axes={channel_axis: input_dim})
+
+    self.kernel = self.add_variable(
+        'kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(
+          'bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint,
+          trainable=True,
+          dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    self.input_spec = InputSpec(ndim=5, axes={c_axis: inputs_shape[c_axis]})
+
+    depth = inputs_shape[d_axis]
+    height = inputs_shape[h_axis]
+    width = inputs_shape[w_axis]
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_depth = conv_utils.deconv_output_length(depth,
+                                                kernel_d,
+                                                self.padding,
+                                                stride_d)
+    out_height = conv_utils.deconv_output_length(height,
+                                                 kernel_h,
+                                                 self.padding,
+                                                 stride_h)
+    out_width = conv_utils.deconv_output_length(width,
+                                                kernel_w,
+                                                self.padding,
+                                                stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_depth, out_height,
+                      out_width)
+      strides = (1, 1, stride_d, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_depth, out_height, out_width,
+                      self.filters)
+      strides = (1, stride_d, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv3d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=5),
+        padding=self.padding.upper())
+
+    if not context.executing_eagerly():
+      # Infer the static output shape:
+      out_shape = inputs.get_shape().as_list()
+      out_shape[c_axis] = self.filters
+      out_shape[d_axis] = conv_utils.deconv_output_length(out_shape[d_axis],
+                                                          kernel_d,
+                                                          self.padding,
+                                                          stride_d)
+      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
+                                                          kernel_h,
+                                                          self.padding,
+                                                          stride_h)
+      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
+                                                          kernel_w,
+                                                          self.padding,
+                                                          stride_w)
+      outputs.set_shape(out_shape)
+
+    if self.use_bias:
+      outputs_shape = outputs.shape.as_list()
+      if outputs_shape[0] is None:
+        outputs_shape[0] = -1
+      if self.data_format == 'channels_first':
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1],
+            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
+        ])
+      else:
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
+            outputs_shape[3], outputs_shape[4]
+        ])
+      outputs_4d = nn.bias_add(
+          outputs_4d,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+      outputs = array_ops.reshape(outputs_4d, outputs_shape)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[d_axis] = conv_utils.deconv_output_length(
+        output_shape[d_axis], kernel_d, self.padding, stride_d)
+    output_shape[h_axis] = conv_utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = conv_utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
+
+
+class SeparableConv(Conv):
+  """Abstract base layer for separable nD convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A tuple or list of integers specifying the spatial
+      dimensions of the filters. Can be a single integer to specify the same
+      value for all spatial dimensions.
+    strides: A tuple or list of integers specifying the strides
+      of the convolution. Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               pointwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(SeparableConv, self).__init__(
+        rank=rank,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        bias_constraint=bias_constraint,
+        trainable=trainable,
+        name=name,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.pointwise_initializer = initializers.get(pointwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.pointwise_regularizer = regularizers.get(pointwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.pointwise_constraint = constraints.get(pointwise_constraint)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    self.input_spec = InputSpec(ndim=self.rank + 2,
+                                axes={channel_axis: input_dim})
+    depthwise_kernel_shape = self.kernel_size + (input_dim,
+                                                 self.depth_multiplier)
+    pointwise_kernel_shape = (
+        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
+
+    self.depthwise_kernel = self.add_variable(
+        name='depthwise_kernel',
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    self.pointwise_kernel = self.add_variable(
+        name='pointwise_kernel',
+        shape=pointwise_kernel_shape,
+        initializer=self.pointwise_initializer,
+        regularizer=self.pointwise_regularizer,
+        constraint=self.pointwise_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    raise NotImplementedError
+
   def get_config(self):
     config = {
         'filters': self.filters,
@@ -705,24 +1260,34 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
         'strides': self.strides,
         'padding': self.padding,
         'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
         'activation': activations.serialize(self.activation),
         'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'depthwise_initializer':
+            initializers.serialize(self.depthwise_initializer),
+        'pointwise_initializer':
+            initializers.serialize(self.pointwise_initializer),
         'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'depthwise_regularizer':
+            regularizers.serialize(self.depthwise_regularizer),
+        'pointwise_regularizer':
+            regularizers.serialize(self.pointwise_regularizer),
         'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'depthwise_constraint':
+            constraints.serialize(self.depthwise_constraint),
+        'pointwise_constraint':
+            constraints.serialize(self.pointwise_constraint),
         'bias_constraint': constraints.serialize(self.bias_constraint)
     }
-    base_config = super(Conv3DTranspose, self).get_config()
+    base_config = super(SeparableConv, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.SeparableConv1D',
            'keras.layers.SeparableConvolution1D')
-class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
+class SeparableConv1D(SeparableConv):
   """Depthwise separable 1D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -802,15 +1367,15 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(SeparableConv1D, self).__init__(
+        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -825,44 +1390,46 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer':
-            regularizers.serialize(self.depthwise_regularizer),
-        'pointwise_regularizer':
-            regularizers.serialize(self.pointwise_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint':
-            constraints.serialize(self.depthwise_constraint),
-        'pointwise_constraint':
-            constraints.serialize(self.pointwise_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(SeparableConv1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      strides = (1,) + self.strides * 2 + (1,)
+      spatial_start_dim = 1
+    else:
+      strides = (1, 1) + self.strides * 2
+      spatial_start_dim = 2
+
+    # Explicitly broadcast inputs and kernels to 4D.
+    # TODO(fchollet): refactor when a native separable_conv1d op is available.
+    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
+    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
+    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
+    dilation_rate = (1,) + self.dilation_rate
+
+    outputs = nn.separable_conv2d(
+        inputs,
+        depthwise_kernel,
+        pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=dilation_rate,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
 
 
 @tf_export('keras.layers.SeparableConv2D',
            'keras.layers.SeparableConvolution2D')
-class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
+class SeparableConv2D(SeparableConv):
   """Depthwise separable 2D convolution.
 
   Separable convolutions consist in first performing
@@ -959,15 +1526,15 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(SeparableConv2D, self).__init__(
+        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -982,47 +1549,30 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters':
-            self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'data_format':
-            self.data_format,
-        'dilation_rate':
-            self.dilation_rate,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer':
-            regularizers.serialize(self.depthwise_regularizer),
-        'pointwise_regularizer':
-            regularizers.serialize(self.pointwise_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint':
-            constraints.serialize(self.depthwise_constraint),
-        'pointwise_constraint':
-            constraints.serialize(self.pointwise_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(SeparableConv2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def call(self, inputs):
+    # Apply the actual ops.
+    if self.data_format == 'channels_last':
+      strides = (1,) + self.strides + (1,)
+    else:
+      strides = (1, 1) + self.strides
+    outputs = nn.separable_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        self.pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=self.dilation_rate,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
 
 
 @tf_export('keras.layers.DepthwiseConv2D')
@@ -1162,7 +1712,7 @@ class DepthwiseConv2D(Conv2D):
     self.built = True
 
   def call(self, inputs, training=None):
-    outputs = K.depthwise_conv2d(
+    outputs = backend.depthwise_conv2d(
         inputs,
         self.depthwise_kernel,
         strides=self.strides,
@@ -1171,7 +1721,7 @@ class DepthwiseConv2D(Conv2D):
         data_format=self.data_format)
 
     if self.bias:
-      outputs = K.bias_add(
+      outputs = backend.bias_add(
           outputs,
           self.bias,
           data_format=self.data_format)
@@ -1246,7 +1796,7 @@ class UpSampling1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], size, input_shape[2]])
 
   def call(self, inputs):
-    output = K.repeat_elements(inputs, self.size, axis=1)
+    output = backend.repeat_elements(inputs, self.size, axis=1)
     return output
 
   def get_config(self):
@@ -1315,7 +1865,8 @@ class UpSampling2D(Layer):
           [input_shape[0], height, width, input_shape[3]])
 
   def call(self, inputs):
-    return K.resize_images(inputs, self.size[0], self.size[1], self.data_format)
+    return backend.resize_images(
+        inputs, self.size[0], self.size[1], self.data_format)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
@@ -1387,8 +1938,8 @@ class UpSampling3D(Layer):
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
   def call(self, inputs):
-    return K.resize_volumes(inputs, self.size[0], self.size[1], self.size[2],
-                            self.data_format)
+    return backend.resize_volumes(
+        inputs, self.size[0], self.size[1], self.size[2], self.data_format)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
@@ -1429,7 +1980,7 @@ class ZeroPadding1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
 
   def call(self, inputs):
-    return K.temporal_padding(inputs, padding=self.padding)
+    return backend.temporal_padding(inputs, padding=self.padding)
 
   def get_config(self):
     config = {'padding': self.padding}
@@ -1530,7 +2081,7 @@ class ZeroPadding2D(Layer):
           [input_shape[0], rows, cols, input_shape[3]])
 
   def call(self, inputs):
-    return K.spatial_2d_padding(
+    return backend.spatial_2d_padding(
         inputs, padding=self.padding, data_format=self.data_format)
 
   def get_config(self):
@@ -1648,7 +2199,7 @@ class ZeroPadding3D(Layer):
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
   def call(self, inputs):
-    return K.spatial_3d_padding(
+    return backend.spatial_3d_padding(
         inputs, padding=self.padding, data_format=self.data_format)
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index c74fc1e4c0a764b4cc0d09129be4e5287a9bdd05..9c4cb0f4fda681ce3236222460cd87439ea67810 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -24,6 +24,7 @@ import types as python_types
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
@@ -32,13 +33,14 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import func_dump
-from tensorflow.python.keras._impl.keras.utils.generic_utils import func_load
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import standard_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -94,7 +96,7 @@ class Masking(Layer):
 
 
 @tf_export('keras.layers.Dropout')
-class Dropout(tf_core_layers.Dropout, Layer):
+class Dropout(Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting
@@ -113,23 +115,40 @@ class Dropout(tf_core_layers.Dropout, Layer):
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
-    # Inheritance call order:
-    # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
-    super(Dropout, self).__init__(rate=rate,
-                                  noise_shape=noise_shape,
-                                  seed=seed,
-                                  **kwargs)
+    super(Dropout, self).__init__(**kwargs)
+    self.rate = rate
+    self.noise_shape = noise_shape
+    self.seed = seed
     self.supports_masking = True
 
+  def _get_noise_shape(self, inputs):
+    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
+    # which will override `self.noise_shape`, and allows for custom noise
+    # shapes with dynamically sized inputs.
+    if self.noise_shape is None:
+      return self.noise_shape
+    return nn_ops._get_noise_shape(inputs, self.noise_shape)  # pylint: disable=protected-access
+
   def call(self, inputs, training=None):
+    original_training_value = training
     if training is None:
       training = K.learning_phase()
-    output = super(Dropout, self).call(inputs, training=training)
+
+    def dropped_inputs():
+      return nn.dropout(inputs, 1  - self.rate,
+                        noise_shape=self._get_noise_shape(inputs),
+                        seed=self.seed)
+    output = tf_utils.smart_cond(training,
+                                 dropped_inputs,
+                                 lambda: array_ops.identity(inputs))
     # EagerTensor object has no attribute _uses_learning_phase
-    if not context.executing_eagerly() and training is K.learning_phase():
+    if not context.executing_eagerly() and original_training_value is None:
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
   def get_config(self):
     config = {
         'rate': self.rate,
@@ -479,7 +498,7 @@ class Permute(Layer):
 
 
 @tf_export('keras.layers.Flatten')
-class Flatten(tf_core_layers.Flatten, Layer):
+class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
   Example:
@@ -495,7 +514,25 @@ class Flatten(tf_core_layers.Flatten, Layer):
       # now: model.output_shape == (None, 65536)
   ```
   """
-  pass
+
+  def __init__(self, **kwargs):
+    super(Flatten, self).__init__(**kwargs)
+    self.input_spec = InputSpec(min_ndim=2)
+
+  def call(self, inputs):
+    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
+    if not context.executing_eagerly():
+      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = [input_shape[0]]
+    if all(input_shape[1:]):
+      output_shape += [np.prod(input_shape[1:])]
+    else:
+      output_shape += [None]
+    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('keras.layers.RepeatVector')
@@ -611,10 +648,12 @@ class Lambda(Layer):
                         'must be a list, a tuple, or a function.')
       self._output_shape = output_shape
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
 
     if self._output_shape is None:
+      if context.executing_eagerly():
+        raise NotImplementedError
       x = K.placeholder(shape=input_shape)
       x = self.call(x)
       if isinstance(x, list):
@@ -640,7 +679,7 @@ class Lambda(Layer):
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
-    if has_arg(self.function, 'mask'):
+    if generic_utils.has_arg(self.function, 'mask'):
       arguments['mask'] = mask
     return self.function(inputs, **arguments)
 
@@ -651,14 +690,14 @@ class Lambda(Layer):
 
   def get_config(self):
     if isinstance(self.function, python_types.LambdaType):
-      function = func_dump(self.function)
+      function = generic_utils.func_dump(self.function)
       function_type = 'lambda'
     else:
       function = self.function.__name__
       function_type = 'function'
 
     if isinstance(self._output_shape, python_types.LambdaType):
-      output_shape = func_dump(self._output_shape)
+      output_shape = generic_utils.func_dump(self._output_shape)
       output_shape_type = 'lambda'
     elif callable(self._output_shape):
       output_shape = self._output_shape.__name__
@@ -686,26 +725,27 @@ class Lambda(Layer):
     function_type = config.pop('function_type')
     if function_type == 'function':
       # Simple lookup in custom objects
-      function = deserialize_keras_object(
+      function = generic_utils.deserialize_keras_object(
           config['function'],
           custom_objects=custom_objects,
           printable_module_name='function in Lambda layer')
     elif function_type == 'lambda':
       # Unsafe deserialization from bytecode
-      function = func_load(config['function'], globs=globs)
+      function = generic_utils.func_load(config['function'], globs=globs)
     else:
       raise TypeError('Unknown function type:', function_type)
 
     output_shape_type = config.pop('output_shape_type')
     if output_shape_type == 'function':
       # Simple lookup in custom objects
-      output_shape = deserialize_keras_object(
+      output_shape = generic_utils.deserialize_keras_object(
           config['output_shape'],
           custom_objects=custom_objects,
           printable_module_name='output_shape function in Lambda layer')
     elif output_shape_type == 'lambda':
       # Unsafe deserialization from bytecode
-      output_shape = func_load(config['output_shape'], globs=globs)
+      output_shape = generic_utils.func_load(config['output_shape'],
+                                             globs=globs)
     else:
       output_shape = config['output_shape']
 
@@ -725,7 +765,7 @@ class Lambda(Layer):
 
 
 @tf_export('keras.layers.Dense')
-class Dense(tf_core_layers.Dense, Layer):
+class Dense(Layer):
   """Just your regular densely-connected NN layer.
 
   `Dense` implements the operation:
@@ -795,21 +835,74 @@ class Dense(tf_core_layers.Dense, Layer):
     if 'input_shape' not in kwargs and 'input_dim' in kwargs:
       kwargs['input_shape'] = (kwargs.pop('input_dim'),)
 
-    # Inheritance call order:
-    # 1) tf.layers.Dense, 2) keras.layers.Layer, 3) tf.layers.Layer
     super(Dense, self).__init__(
-        units,
-        activation=activations.get(activation),
-        use_bias=use_bias,
-        kernel_initializer=initializers.get(kernel_initializer),
-        bias_initializer=initializers.get(bias_initializer),
-        kernel_regularizer=regularizers.get(kernel_regularizer),
-        bias_regularizer=regularizers.get(bias_regularizer),
-        activity_regularizer=regularizers.get(activity_regularizer),
-        kernel_constraint=constraints.get(kernel_constraint),
-        bias_constraint=constraints.get(bias_constraint),
-        **kwargs)
+        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    self.units = int(units)
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
     self.supports_masking = True
+    self.input_spec = InputSpec(min_ndim=2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if input_shape[-1].value is None:
+      raise ValueError('The last dimension of the inputs to `Dense` '
+                       'should be defined. Found `None`.')
+    self.input_spec = InputSpec(min_ndim=2,
+                                axes={-1: input_shape[-1].value})
+    self.kernel = self.add_variable('kernel',
+                                    shape=[input_shape[-1].value, self.units],
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    dtype=self.dtype,
+                                    trainable=True)
+    if self.use_bias:
+      self.bias = self.add_variable('bias',
+                                    shape=[self.units,],
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    dtype=self.dtype,
+                                    trainable=True)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    shape = inputs.get_shape().as_list()
+    if len(shape) > 2:
+      # Broadcasting is required for the inputs.
+      outputs = standard_ops.tensordot(inputs, self.kernel, [[len(shape) - 1],
+                                                             [0]])
+      # Reshape the output back to the original ndim of the input.
+      if not context.executing_eagerly():
+        output_shape = shape[:-1] + [self.units]
+        outputs.set_shape(output_shape)
+    else:
+      outputs = gen_math_ops.mat_mul(inputs, self.kernel)
+    if self.use_bias:
+      outputs = nn.bias_add(outputs, self.bias)
+    if self.activation is not None:
+      return self.activation(outputs)  # pylint: disable=not-callable
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    input_shape = input_shape.with_rank_at_least(2)
+    if input_shape[-1].value is None:
+      raise ValueError(
+          'The innermost dimension of input_shape must be defined, but saw: %s'
+          % input_shape)
+    return input_shape[:-1].concatenate(self.units)
 
   def get_config(self):
     config = {
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index 551d1b1c3a0a80ed81ad03afc4c9510a231e33ef..d22d8d12dc4e76998c177dbe96fb87e3fffa5175 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -129,7 +129,6 @@ class CoreLayersTest(test.TestCase):
     testing_utils.layer_test(
         keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
 
-  @tf_test_util.run_in_graph_and_eager_modes()
   def test_lambda(self):
     testing_utils.layer_test(
         keras.layers.Lambda,
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 540e2d945c986aebbd7028e4a1f2e4566747320f..591bab7cd86aefcad58d47eafbd061da4ca04b83 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -102,7 +102,8 @@ class Embedding(Layer):
         kwargs['input_shape'] = (input_length,)
       else:
         kwargs['input_shape'] = (None,)
-    super(Embedding, self).__init__(**kwargs)
+    dtype = kwargs.pop('dtype', K.floatx())
+    super(Embedding, self).__init__(dtype=dtype, **kwargs)
 
     self.input_dim = input_dim
     self.output_dim = output_dim
@@ -120,8 +121,7 @@ class Embedding(Layer):
         initializer=self.embeddings_initializer,
         name='embeddings',
         regularizer=self.embeddings_regularizer,
-        constraint=self.embeddings_constraint,
-        dtype=self.dtype)
+        constraint=self.embeddings_constraint)
     self.built = True
 
   def compute_mask(self, inputs, mask=None):
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
index 26fd1f1c114587c2f1b3e0155f1259dd5f0dcf60..9f6793eac85854ea82d36b425a883f9abe54f1eb 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.platform import test
 
 class EmbeddingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes()
+  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=False)
   def test_embedding(self):
     testing_utils.layer_test(
         keras.layers.Embedding,
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index 3b44b20bf822429351002c0f81fe8f9596d595d3..c16fc07fb4ecda66bd8bcc70dce5d753c73f5dd9 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -19,17 +19,28 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
+from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.layers import normalization as tf_normalization_layers
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import state_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.layers.BatchNormalization')
-class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
+class BatchNormalization(Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
   Normalize the activations of the previous layer at each batch,
@@ -37,28 +48,63 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
   close to 0 and the activation standard deviation close to 1.
 
   Arguments:
-      axis: Integer, the axis that should be normalized
-          (typically the features axis).
-          For instance, after a `Conv2D` layer with
-          `data_format="channels_first"`,
-          set `axis=1` in `BatchNormalization`.
-      momentum: Momentum for the moving average.
-      epsilon: Small float added to variance to avoid dividing by zero.
-      center: If True, add offset of `beta` to normalized tensor.
-          If False, `beta` is ignored.
-      scale: If True, multiply by `gamma`.
-          If False, `gamma` is not used.
-          When the next layer is linear (also e.g. `nn.relu`),
-          this can be disabled since the scaling
-          will be done by the next layer.
-      beta_initializer: Initializer for the beta weight.
-      gamma_initializer: Initializer for the gamma weight.
-      moving_mean_initializer: Initializer for the moving mean.
-      moving_variance_initializer: Initializer for the moving variance.
-      beta_regularizer: Optional regularizer for the beta weight.
-      gamma_regularizer: Optional regularizer for the gamma weight.
-      beta_constraint: Optional constraint for the beta weight.
-      gamma_constraint: Optional constraint for the gamma weight.
+    axis: Integer, the axis that should be normalized
+        (typically the features axis).
+        For instance, after a `Conv2D` layer with
+        `data_format="channels_first"`,
+        set `axis=1` in `BatchNormalization`.
+    momentum: Momentum for the moving average.
+    epsilon: Small float added to variance to avoid dividing by zero.
+    center: If True, add offset of `beta` to normalized tensor.
+        If False, `beta` is ignored.
+    scale: If True, multiply by `gamma`.
+        If False, `gamma` is not used.
+        When the next layer is linear (also e.g. `nn.relu`),
+        this can be disabled since the scaling
+        will be done by the next layer.
+    beta_initializer: Initializer for the beta weight.
+    gamma_initializer: Initializer for the gamma weight.
+    moving_mean_initializer: Initializer for the moving mean.
+    moving_variance_initializer: Initializer for the moving variance.
+    beta_regularizer: Optional regularizer for the beta weight.
+    gamma_regularizer: Optional regularizer for the gamma weight.
+    beta_constraint: Optional constraint for the beta weight.
+    gamma_constraint: Optional constraint for the gamma weight.
+    renorm: Whether to use Batch Renormalization
+      (https://arxiv.org/abs/1702.03275). This adds extra variables during
+      training. The inference is the same for either value of this parameter.
+    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
+      scalar `Tensors` used to clip the renorm correction. The correction
+      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
+      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
+      dmax are set to inf, 0, inf, respectively.
+    renorm_momentum: Momentum used to update the moving means and standard
+      deviations with renorm. Unlike `momentum`, this affects training
+      and should be neither too small (which would add noise) nor too large
+      (which would give stale estimates). Note that `momentum` is still applied
+      to get the means and variances for inference.
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
+      which means batch normalization is performed across the whole batch. When
+      `virtual_batch_size` is not `None`, instead perform "Ghost Batch
+      Normalization", which creates virtual sub-batches which are each
+      normalized separately (with shared gamma, beta, and moving statistics).
+      Must divide the actual batch size during execution.
+    adjustment: A function taking the `Tensor` containing the (dynamic) shape of
+      the input tensor and returning a pair (scale, bias) to apply to the
+      normalized values (before gamma and beta), only during training. For
+      example, if axis==-1,
+        `adjustment = lambda shape: (
+          tf.random_uniform(shape[-1:], 0.93, 1.07),
+          tf.random_uniform(shape[-1:], -0.1, 0.1))`
+      will scale the normalized value by up to 7% up or down, then shift the
+      result by up to 0.1 (with independent scaling and bias for each feature
+      but shared across all examples), and finally apply gamma and/or beta. If
+      `None`, no adjustment is applied. Cannot be specified if
+      virtual_batch_size is specified.
 
   Input shape:
       Arbitrary. Use the keyword argument `input_shape`
@@ -87,33 +133,508 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
                gamma_regularizer=None,
                beta_constraint=None,
                gamma_constraint=None,
+               renorm=False,
+               renorm_clipping=None,
+               renorm_momentum=0.99,
+               fused=None,
+               trainable=True,
+               virtual_batch_size=None,
+               adjustment=None,
+               name=None,
                **kwargs):
-    self.supports_masking = True
     super(BatchNormalization, self).__init__(
-        axis=axis,
-        momentum=momentum,
-        epsilon=epsilon,
-        center=center,
-        scale=scale,
-        beta_initializer=initializers.get(beta_initializer),
-        gamma_initializer=initializers.get(gamma_initializer),
-        moving_mean_initializer=initializers.get(moving_mean_initializer),
-        moving_variance_initializer=initializers.get(
-            moving_variance_initializer),
-        beta_regularizer=regularizers.get(beta_regularizer),
-        gamma_regularizer=regularizers.get(gamma_regularizer),
-        beta_constraint=constraints.get(beta_constraint),
-        gamma_constraint=constraints.get(gamma_constraint),
-        **kwargs
-    )
+        name=name, trainable=trainable, **kwargs)
+    if isinstance(axis, list):
+      self.axis = axis[:]
+    else:
+      self.axis = axis
+    self.momentum = momentum
+    self.epsilon = epsilon
+    self.center = center
+    self.scale = scale
+    self.beta_initializer = initializers.get(beta_initializer)
+    self.gamma_initializer = initializers.get(gamma_initializer)
+    self.moving_mean_initializer = initializers.get(moving_mean_initializer)
+    self.moving_variance_initializer = initializers.get(
+        moving_variance_initializer)
+    self.beta_regularizer = regularizers.get(beta_regularizer)
+    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    self.beta_constraint = constraints.get(beta_constraint)
+    self.gamma_constraint = constraints.get(gamma_constraint)
+    self.renorm = renorm
+    self.virtual_batch_size = virtual_batch_size
+    self.adjustment = adjustment
+    if fused is None:
+      fused = True
+    self.supports_masking = True
+
+    self.fused = fused
+    self._bessels_correction_test_only = True
+
+    if renorm:
+      renorm_clipping = renorm_clipping or {}
+      keys = ['rmax', 'rmin', 'dmax']
+      if set(renorm_clipping) - set(keys):
+        raise ValueError('renorm_clipping %s contains keys not in %s' %
+                         (renorm_clipping, keys))
+      self.renorm_clipping = renorm_clipping
+      self.renorm_momentum = renorm_momentum
+
+  def _add_tower_local_variable(self, *args, **kwargs):
+    tower_context = distribute_lib.get_tower_context()
+    with tower_context.tower_local_var_scope('mean'):
+      return self.add_variable(*args, **kwargs)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if not input_shape.ndims:
+      raise ValueError('Input has undefined rank:', input_shape)
+    ndims = len(input_shape)
+
+    # Convert axis to list and resolve negatives
+    if isinstance(self.axis, int):
+      self.axis = [self.axis]
+
+    if not isinstance(self.axis, list):
+      raise TypeError('axis must be int or list, type given: %s'
+                      % type(self.axis))
+
+    for idx, x in enumerate(self.axis):
+      if x < 0:
+        self.axis[idx] = ndims + x
+
+    # Validate axes
+    for x in self.axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.axis) != len(set(self.axis)):
+      raise ValueError('Duplicate axis: %s' % self.axis)
+
+    if self.virtual_batch_size is not None:
+      if self.virtual_batch_size <= 0:
+        raise ValueError('virtual_batch_size must be a positive integer that '
+                         'divides the true batch size of the input Tensor')
+      # If using virtual batches, the first dimension must be the batch
+      # dimension and cannot be the batch norm axis
+      if 0 in self.axis:
+        raise ValueError('When using virtual_batch_size, the batch dimension '
+                         'must be 0 and thus axis cannot include 0')
+      if self.adjustment is not None:
+        raise ValueError('When using virtual_batch_size, adjustment cannot '
+                         'be specified')
+
+    if self.fused:
+      # Currently fused batch norm doesn't support renorm. It also only supports
+      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
+      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
+      # output back to its original shape accordingly.
+      self.fused = (not self.renorm and
+                    ndims == 4 and
+                    self.axis in [[1], [3]] and
+                    self.virtual_batch_size is None and
+                    self.adjustment is None)
+      # TODO(chrisying): fused batch norm is currently not supported for
+      # multi-axis batch norm and by extension virtual batches. In some cases,
+      # it might be possible to use fused batch norm but would require reshaping
+      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
+      # particularly tricky. A compromise might be to just support the most
+      # common use case (turning 5D w/ virtual batch to NCHW)
+
+    if self.fused:
+      if self.axis == [1]:
+        self._data_format = 'NCHW'
+      elif self.axis == [3]:
+        self._data_format = 'NHWC'
+      else:
+        raise ValueError('Unsupported axis, fused batch norm only supports '
+                         'axis == [1] or axis == [3]')
+
+    # Raise parameters of fp16 batch norm to fp32
+    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
+      param_dtype = dtypes.float32
+    else:
+      param_dtype = self.dtype or dtypes.float32
+
+    axis_to_dim = {x: input_shape[x].value for x in self.axis}
+    for x in axis_to_dim:
+      if axis_to_dim[x] is None:
+        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
+                         input_shape)
+    self.input_spec = InputSpec(ndim=ndims, axes=axis_to_dim)
+
+    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
+      # Single axis batch norm (most common/default use-case)
+      param_shape = (list(axis_to_dim.values())[0],)
+    else:
+      # Parameter shape is the original shape but with 1 in all non-axis dims
+      param_shape = [axis_to_dim[i] if i in axis_to_dim
+                     else 1 for i in range(ndims)]
+      if self.virtual_batch_size is not None:
+        # When using virtual batches, add an extra dim at index 1
+        param_shape.insert(1, 1)
+        for idx, x in enumerate(self.axis):
+          self.axis[idx] = x + 1      # Account for added dimension
+
+    if self.scale:
+      self.gamma = self.add_variable(
+          name='gamma',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          trainable=True)
+    else:
+      self.gamma = None
+      if self.fused:
+        self._gamma_const = array_ops.constant(
+            1.0, dtype=param_dtype, shape=param_shape)
+
+    if self.center:
+      self.beta = self.add_variable(
+          name='beta',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          trainable=True)
+    else:
+      self.beta = None
+      if self.fused:
+        self._beta_const = array_ops.constant(
+            0.0, dtype=param_dtype, shape=param_shape)
+
+    try:
+      # Disable variable partitioning when creating the moving mean and variance
+      if hasattr(self, '_scope') and self._scope:
+        partitioner = self._scope.partitioner
+        self._scope.set_partitioner(None)
+      else:
+        partitioner = None
+      self.moving_mean = self._add_tower_local_variable(
+          name='moving_mean',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.moving_mean_initializer,
+          trainable=False)
+
+      self.moving_variance = self._add_tower_local_variable(
+          name='moving_variance',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.moving_variance_initializer,
+          trainable=False)
+
+      if self.renorm:
+        # Create variables to maintain the moving mean and standard deviation.
+        # These are used in training and thus are different from the moving
+        # averages above. The renorm variables are colocated with moving_mean
+        # and moving_variance.
+        # NOTE: below, the outer `with device` block causes the current device
+        # stack to be cleared. The nested ones use a `lambda` to set the desired
+        # device and ignore any devices that may be set by the custom getter.
+        def _renorm_variable(name, shape):
+          var = self._add_tower_local_variable(
+              name=name,
+              shape=shape,
+              dtype=param_dtype,
+              initializer=init_ops.zeros_initializer(),
+              trainable=False)
+          return var
+
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_mean):
+          self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
+          self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
+        # We initialize renorm_stddev to 0, and maintain the (0-initialized)
+        # renorm_stddev_weight. This allows us to (1) mix the average
+        # stddev with the minibatch stddev early in training, and (2) compute
+        # the unbiased average stddev by dividing renorm_stddev by the weight.
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_variance):
+          self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
+          self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
+                                                       ())
+    finally:
+      if partitioner:
+        self._scope.set_partitioner(partitioner)
+    self.built = True
+
+  def _assign_moving_average(self, variable, value, momentum):
+    with ops.name_scope(None, 'AssignMovingAvg',
+                        [variable, value, momentum]) as scope:
+      decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
+      if decay.dtype != variable.dtype.base_dtype:
+        decay = math_ops.cast(decay, variable.dtype.base_dtype)
+      update_delta = (variable - value) * decay
+      return state_ops.assign_sub(variable, update_delta, name=scope)
+
+  def _fused_batch_norm(self, inputs, training):
+    """Returns the output of fused batch norm."""
+    beta = self.beta if self.center else self._beta_const
+    gamma = self.gamma if self.scale else self._gamma_const
+
+    def _fused_batch_norm_training():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          epsilon=self.epsilon,
+          data_format=self._data_format)
+
+    def _fused_batch_norm_inference():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          mean=self.moving_mean,
+          variance=self.moving_variance,
+          epsilon=self.epsilon,
+          is_training=False,
+          data_format=self._data_format)
+
+    output, mean, variance = tf_utils.smart_cond(
+        training, _fused_batch_norm_training, _fused_batch_norm_inference)
+    if not self._bessels_correction_test_only:
+      # Remove Bessel's correction to be consistent with non-fused batch norm.
+      # Note that the variance computed by fused batch norm is
+      # with Bessel's correction.
+      sample_size = math_ops.cast(
+          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
+      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
+      variance *= factor
+
+    training_value = tf_utils.constant_value(training)
+    if training_value is None:
+      momentum = tf_utils.smart_cond(training,
+                                     lambda: self.momentum,
+                                     lambda: 1.0)
+    else:
+      momentum = ops.convert_to_tensor(self.momentum)
+    if training_value or training_value is None:
+      mean_update = self._assign_moving_average(self.moving_mean, mean,
+                                                momentum)
+      variance_update = self._assign_moving_average(self.moving_variance,
+                                                    variance, momentum)
+      self.add_update(mean_update, inputs=True)
+      self.add_update(variance_update, inputs=True)
+
+    return output
+
+  def _renorm_correction_and_moments(self, mean, variance, training):
+    """Returns the correction and update values for renorm."""
+    stddev = math_ops.sqrt(variance + self.epsilon)
+    # Compute the average mean and standard deviation, as if they were
+    # initialized with this batch's moments.
+    mixed_renorm_mean = (self.renorm_mean +
+                         (1. - self.renorm_mean_weight) * mean)
+    mixed_renorm_stddev = (self.renorm_stddev +
+                           (1. - self.renorm_stddev_weight) * stddev)
+    # Compute the corrections for batch renorm.
+    r = stddev / mixed_renorm_stddev
+    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
+    # Ensure the corrections use pre-update moving averages.
+    with ops.control_dependencies([r, d]):
+      mean = array_ops.identity(mean)
+      stddev = array_ops.identity(stddev)
+    rmin, rmax, dmax = [self.renorm_clipping.get(key)
+                        for key in ['rmin', 'rmax', 'dmax']]
+    if rmin is not None:
+      r = math_ops.maximum(r, rmin)
+    if rmax is not None:
+      r = math_ops.minimum(r, rmax)
+    if dmax is not None:
+      d = math_ops.maximum(d, -dmax)
+      d = math_ops.minimum(d, dmax)
+    # When not training, use r=1, d=0.
+    r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
+    d = tf_utils.smart_cond(training,
+                            lambda: d,
+                            lambda: array_ops.zeros_like(d))
+
+    def _update_renorm_variable(var, weight, value):
+      """Updates a moving average and weight, returns the unbiased value."""
+      value = array_ops.identity(value)
+      def _do_update():
+        """Updates the var and weight, returns their updated ratio."""
+        # Update the variables without zero debiasing. The debiasing will be
+        # accomplished by dividing the exponential moving average by the weight.
+        # For example, after a single update, the moving average would be
+        # (1-decay) * value. and the weight will be 1-decay, with their ratio
+        # giving the value.
+        # Make sure the weight is not updated until before r and d computation.
+        with ops.control_dependencies([value]):
+          weight_value = array_ops.constant(1., dtype=weight.dtype)
+        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
+        new_weight = self._assign_moving_average(weight, weight_value,
+                                                 self.renorm_momentum)
+        # TODO(yuefengz): the updates to var and weighted can not be batched
+        # together if we fetch their updated values here. Consider calculating
+        # new values and delaying the updates.
+        return new_var / new_weight
+
+      def _fake_update():
+        return array_ops.identity(var)
+      return tf_utils.smart_cond(training, _do_update, _fake_update)
+
+    # TODO(yuefengz): colocate the operations
+    new_mean = _update_renorm_variable(self.renorm_mean,
+                                       self.renorm_mean_weight, mean)
+    new_stddev = _update_renorm_variable(self.renorm_stddev,
+                                         self.renorm_stddev_weight, stddev)
+    # Make sqrt(moving_variance + epsilon) = new_stddev.
+    new_variance = math_ops.square(new_stddev) - self.epsilon
+
+    return (r, d, new_mean, new_variance)
 
   def call(self, inputs, training=None):
+    original_training_value = training
     if training is None:
       training = K.learning_phase()
-    output = super(BatchNormalization, self).call(inputs, training=training)
-    if not context.executing_eagerly() and training is K.learning_phase():
-      output._uses_learning_phase = True  # pylint: disable=protected-access
-    return output
+
+    in_eager_mode = context.executing_eagerly()
+    if self.virtual_batch_size is not None:
+      # Virtual batches (aka ghost batches) can be simulated by reshaping the
+      # Tensor and reusing the existing batch norm implementation
+      original_shape = [-1] + inputs.shape.as_list()[1:]
+      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
+
+      # Will cause errors if virtual_batch_size does not divide the batch size
+      inputs = array_ops.reshape(inputs, expanded_shape)
+
+      def undo_virtual_batching(outputs):
+        outputs = array_ops.reshape(outputs, original_shape)
+        return outputs
+
+    if self.fused:
+      outputs = self._fused_batch_norm(inputs, training=training)
+      if self.virtual_batch_size is not None:
+        # Currently never reaches here since fused_batch_norm does not support
+        # virtual batching
+        outputs = undo_virtual_batching(outputs)
+      if not context.executing_eagerly() and original_training_value is None:
+        outputs._uses_learning_phase = True  # pylint: disable=protected-access
+      return outputs
+
+    # Compute the axes along which to reduce the mean / variance
+    input_shape = inputs.get_shape()
+    ndims = len(input_shape)
+    reduction_axes = [i for i in range(ndims) if i not in self.axis]
+    if self.virtual_batch_size is not None:
+      del reduction_axes[1]     # Do not reduce along virtual batch dim
+
+    # Broadcasting only necessary for single-axis batch norm where the axis is
+    # not the last dimension
+    broadcast_shape = [1] * ndims
+    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
+    def _broadcast(v):
+      if (v is not None and
+          len(v.get_shape()) != ndims and
+          reduction_axes != list(range(ndims - 1))):
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+
+    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
+
+    def _compose_transforms(scale, offset, then_scale, then_offset):
+      if then_scale is not None:
+        scale *= then_scale
+        offset *= then_scale
+      if then_offset is not None:
+        offset += then_offset
+      return (scale, offset)
+
+    # Determine a boolean value for `training`: could be True, False, or None.
+    training_value = tf_utils.constant_value(training)
+    if training_value is not False:
+      if self.adjustment:
+        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
+        # Adjust only during training.
+        adj_scale = tf_utils.smart_cond(training,
+                                        lambda: adj_scale,
+                                        lambda: array_ops.ones_like(adj_scale))
+        adj_bias = tf_utils.smart_cond(training,
+                                       lambda: adj_bias,
+                                       lambda: array_ops.zeros_like(adj_bias))
+        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
+
+      # Some of the computations here are not necessary when training==False
+      # but not a constant. However, this makes the code simpler.
+      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
+      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+
+      moving_mean = self.moving_mean
+      moving_variance = self.moving_variance
+
+      mean = tf_utils.smart_cond(training,
+                                 lambda: mean,
+                                 lambda: moving_mean)
+      variance = tf_utils.smart_cond(training,
+                                     lambda: variance,
+                                     lambda: moving_variance)
+
+      if self.renorm:
+        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
+            mean, variance, training)
+        # When training, the normalized values (say, x) will be transformed as
+        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
+        # = x * (r * gamma) + (d * gamma + beta) with renorm.
+        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
+        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
+        scale, offset = _compose_transforms(r, d, scale, offset)
+      else:
+        new_mean, new_variance = mean, variance
+
+      if self.virtual_batch_size is not None:
+        # This isn't strictly correct since in ghost batch norm, you are
+        # supposed to sequentially update the moving_mean and moving_variance
+        # with each sub-batch. However, since the moving statistics are only
+        # used during evaluation, it is more efficient to just update in one
+        # step and should not make a significant difference in the result.
+        new_mean = math_ops.reduce_mean(new_mean,
+                                        axis=1, keepdims=True)
+        new_variance = math_ops.reduce_mean(new_variance,
+                                            axis=1, keepdims=True)
+
+      def _do_update(var, value):
+        if in_eager_mode and not self.trainable:
+          return
+
+        return self._assign_moving_average(var, value, self.momentum)
+
+      mean_update = tf_utils.smart_cond(
+          training,
+          lambda: _do_update(self.moving_mean, new_mean),
+          lambda: self.moving_mean)
+      variance_update = tf_utils.smart_cond(
+          training,
+          lambda: _do_update(self.moving_variance, new_variance),
+          lambda: self.moving_variance)
+      if not context.executing_eagerly():
+        self.add_update(mean_update, inputs=True)
+        self.add_update(variance_update, inputs=True)
+
+    else:
+      mean, variance = self.moving_mean, self.moving_variance
+
+    outputs = nn.batch_normalization(inputs,
+                                     _broadcast(mean),
+                                     _broadcast(variance),
+                                     offset,
+                                     scale,
+                                     self.epsilon)
+    # If some components of the shape got lost due to adjustments, fix that.
+    outputs.set_shape(input_shape)
+
+    if self.virtual_batch_size is not None:
+      outputs = undo_virtual_batching(outputs)
+    if not context.executing_eagerly() and original_training_value is None:
+      outputs._uses_learning_phase = True  # pylint: disable=protected-access
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
 
   def get_config(self):
     config = {
@@ -133,5 +654,19 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
         'beta_constraint': constraints.serialize(self.beta_constraint),
         'gamma_constraint': constraints.serialize(self.gamma_constraint)
     }
+    # Only add TensorFlow-specific parameters if they are set, so as to preserve
+    # model compatibility with external Keras.
+    if self.renorm:
+      config['renorm'] = True
+      config['renorm_clipping'] = self.renorm_clipping
+      config['renorm_momentum'] = self.renorm_momentum
+    if self.virtual_batch_size is not None:
+      config['virtual_batch_size'] = self.virtual_batch_size
+    # Note: adjustment is not serializable.
+    if self.adjustment is not None:
+      logging.warning('The `adjustment` function of this `BatchNormalization` '
+                      'layer cannot be serialized and has been omitted from '
+                      'the layer config. It will not be included when '
+                      're-creating the layer from the saved config.')
     base_config = super(BatchNormalization, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
index 2b3628c3f1023612297465bdf3286246261992a2..fa9277e3d1e5bb0b9633abc46a96a11816dddb2d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
@@ -114,6 +114,26 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
+  def test_batchnorm_convnet_channel_last(self):
+    with self.test_session():
+      # keras.backend.set_learning_phase(True)
+
+      model = keras.models.Sequential()
+      norm = keras.layers.BatchNormalization(
+          axis=-1, input_shape=(4, 4, 3), momentum=0.8)
+      model.add(norm)
+      model.compile(loss='mse', optimizer='sgd')
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
+      model.fit(x, x, epochs=4, verbose=0)
+      out = model.predict(x)
+      out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
+      out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
+
+      np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
+      np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+
   def test_shared_batchnorm(self):
     """Test that a BN layer can be shared across different data streams.
     """
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling.py b/tensorflow/python/keras/_impl/keras/layers/pooling.py
index 15d53379769d8142f5b2755a07479f60751346d2..86bc8a680a529a9ea17592a42207fab58adeebce 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling.py
@@ -19,16 +19,98 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.layers import pooling as tf_pooling_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Pooling1D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 1D inputs.
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of a single integer,
+      representing the size of the pooling window.
+    strides: An integer or tuple/list of a single integer, specifying the
+      strides of the pooling operation.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format=None,
+               name=None, **kwargs):
+    super(Pooling1D, self).__init__(name=name, **kwargs)
+    if data_format is None:
+      data_format = backend.image_data_format()
+    if strides is None:
+      strides = pool_size
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 1, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=3)
+
+  def call(self, inputs):
+    # There is no TF op for 1D pooling, hence we make the inputs 4D.
+    if self.data_format == 'channels_last':
+      # input is NWC, make it NHWC
+      inputs = array_ops.expand_dims(inputs, 1)
+      # pool on the W dim
+      pool_shape = (1, 1) + self.pool_size + (1,)
+      strides = (1, 1) + self.strides + (1,)
+      data_format = 'NHWC'
+    else:
+      # input is NCW, make it NCHW
+      inputs = array_ops.expand_dims(inputs, 2)
+      # pool on the W dim
+      pool_shape = (1, 1, 1) + self.pool_size
+      strides = (1, 1, 1) + self.strides
+      data_format = 'NCHW'
+
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper(),
+        data_format=data_format)
+
+    if self.data_format == 'channels_last':
+      return array_ops.squeeze(outputs, 1)
+    else:
+      return array_ops.squeeze(outputs, 2)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    length = conv_utils.conv_output_length(input_shape[1], self.pool_size[0],
+                                           self.padding, self.strides[0])
+    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+
+  def get_config(self):
+    config = {
+        'strides': self.strides,
+        'pool_size': self.pool_size,
+        'padding': self.padding
+    }
+    base_config = super(Pooling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @tf_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
-class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
+class MaxPooling1D(Pooling1D):
   """Max pooling operation for temporal data.
 
   Arguments:
@@ -45,23 +127,20 @@ class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
       3D tensor with shape: `(batch_size, downsampled_steps, features)`.
   """
 
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling1D, self).__init__(pool_size, strides, padding, **kwargs)
+  def __init__(self, pool_size=2, strides=None,
+               padding='valid', data_format=None, **kwargs):
 
-  def get_config(self):
-    config = {
-        'strides': self.strides,
-        'pool_size': self.pool_size,
-        'padding': self.padding
-    }
-    base_config = super(MaxPooling1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling1D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D')
-class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
+class AveragePooling1D(Pooling1D):
   """Average pooling for temporal data.
 
   Arguments:
@@ -78,24 +157,104 @@ class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
       3D tensor with shape: `(batch_size, downsampled_steps, features)`.
   """
 
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
+  def __init__(self, pool_size=2, strides=None,
+               padding='valid', data_format=None, **kwargs):
+    super(AveragePooling1D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        **kwargs)
+
+
+class Pooling2D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format=None,
+               name=None, **kwargs):
+    super(Pooling2D, self).__init__(name=name, **kwargs)
+    if data_format is None:
+      data_format = backend.image_data_format()
     if strides is None:
       strides = pool_size
-    super(AveragePooling1D, self).__init__(pool_size, strides, padding,
-                                           **kwargs)
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=4)
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      pool_shape = (1,) + self.pool_size + (1,)
+      strides = (1,) + self.strides + (1,)
+    else:
+      pool_shape = (1, 1) + self.pool_size
+      strides = (1, 1) + self.strides
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format, 4))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    else:
+      rows = input_shape[1]
+      cols = input_shape[2]
+    rows = conv_utils.conv_output_length(rows, self.pool_size[0], self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.pool_size[1], self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], rows, cols])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, input_shape[3]])
 
   def get_config(self):
     config = {
-        'strides': self.strides,
         'pool_size': self.pool_size,
-        'padding': self.padding
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
     }
-    base_config = super(AveragePooling1D, self).get_config()
+    base_config = super(Pooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
-class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
+class MaxPooling2D(Pooling2D):
   """Max pooling operation for spatial data.
 
   Arguments:
@@ -142,26 +301,14 @@ class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling2D, self).__init__(pool_size, strides, padding, data_format,
-                                       **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(MaxPooling2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling2D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D')
-class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
+class AveragePooling2D(Pooling2D):
   """Average pooling operation for spatial data.
 
   Arguments:
@@ -208,12 +355,96 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
+    super(AveragePooling2D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
+
+
+class Pooling3D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 3D inputs.
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of 3 integers:
+      (pool_depth, pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)`
+      while `channels_first` corresponds to
+      inputs with shape `(batch, channels, depth, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format='channels_last',
+               name=None, **kwargs):
+    super(Pooling3D, self).__init__(name=name, **kwargs)
     if data_format is None:
-      data_format = K.image_data_format()
+      data_format = backend.image_data_format()
     if strides is None:
       strides = pool_size
-    super(AveragePooling2D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 3, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 3, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=5)
+
+  def call(self, inputs):
+    pool_shape = (1,) + self.pool_size + (1,)
+    strides = (1,) + self.strides + (1,)
+
+    if self.data_format == 'channels_first':
+      # TF does not support `channels_first` with 3D pooling operations,
+      # so we must handle this case manually.
+      # TODO(fchollet): remove this when TF pooling is feature-complete.
+      inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
+
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper())
+
+    if self.data_format == 'channels_first':
+      outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      len_dim1 = input_shape[2]
+      len_dim2 = input_shape[3]
+      len_dim3 = input_shape[4]
+    else:
+      len_dim1 = input_shape[1]
+      len_dim2 = input_shape[2]
+      len_dim3 = input_shape[3]
+    len_dim1 = conv_utils.conv_output_length(len_dim1, self.pool_size[0],
+                                             self.padding, self.strides[0])
+    len_dim2 = conv_utils.conv_output_length(len_dim2, self.pool_size[1],
+                                             self.padding, self.strides[1])
+    len_dim3 = conv_utils.conv_output_length(len_dim3, self.pool_size[2],
+                                             self.padding, self.strides[2])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
 
   def get_config(self):
     config = {
@@ -222,12 +453,12 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
         'strides': self.strides,
         'data_format': self.data_format
     }
-    base_config = super(AveragePooling2D, self).get_config()
+    base_config = super(Pooling3D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
-class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
+class MaxPooling3D(Pooling3D):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -270,26 +501,14 @@ class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling3D, self).__init__(pool_size, strides, padding, data_format,
-                                       **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(MaxPooling3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling3D, self).__init__(
+        nn.max_pool3d,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D')
-class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
+class AveragePooling3D(Pooling3D):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -332,30 +551,18 @@ class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(AveragePooling3D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(AveragePooling3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(AveragePooling3D, self).__init__(
+        nn.avg_pool3d,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
-class _GlobalPooling1D(Layer):
+class GlobalPooling1D(Layer):
   """Abstract class for different global pooling 1D layers.
   """
 
   def __init__(self, **kwargs):
-    super(_GlobalPooling1D, self).__init__(**kwargs)
+    super(GlobalPooling1D, self).__init__(**kwargs)
     self.input_spec = InputSpec(ndim=3)
 
   def compute_output_shape(self, input_shape):
@@ -368,7 +575,7 @@ class _GlobalPooling1D(Layer):
 
 @tf_export('keras.layers.GlobalAveragePooling1D',
            'keras.layers.GlobalAvgPool1D')
-class GlobalAveragePooling1D(_GlobalPooling1D):
+class GlobalAveragePooling1D(GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
   Input shape:
@@ -380,11 +587,11 @@ class GlobalAveragePooling1D(_GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return K.mean(inputs, axis=1)
+    return backend.mean(inputs, axis=1)
 
 
 @tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
-class GlobalMaxPooling1D(_GlobalPooling1D):
+class GlobalMaxPooling1D(GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
   Input shape:
@@ -396,15 +603,15 @@ class GlobalMaxPooling1D(_GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return K.max(inputs, axis=1)
+    return backend.max(inputs, axis=1)
 
 
-class _GlobalPooling2D(Layer):
+class GlobalPooling2D(Layer):
   """Abstract class for different global pooling 2D layers.
   """
 
   def __init__(self, data_format=None, **kwargs):
-    super(_GlobalPooling2D, self).__init__(**kwargs)
+    super(GlobalPooling2D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=4)
 
@@ -420,13 +627,13 @@ class _GlobalPooling2D(Layer):
 
   def get_config(self):
     config = {'data_format': self.data_format}
-    base_config = super(_GlobalPooling2D, self).get_config()
+    base_config = super(GlobalPooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.GlobalAveragePooling2D',
            'keras.layers.GlobalAvgPool2D')
-class GlobalAveragePooling2D(_GlobalPooling2D):
+class GlobalAveragePooling2D(GlobalPooling2D):
   """Global average pooling operation for spatial data.
 
   Arguments:
@@ -456,13 +663,13 @@ class GlobalAveragePooling2D(_GlobalPooling2D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.mean(inputs, axis=[1, 2])
+      return backend.mean(inputs, axis=[1, 2])
     else:
-      return K.mean(inputs, axis=[2, 3])
+      return backend.mean(inputs, axis=[2, 3])
 
 
 @tf_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D')
-class GlobalMaxPooling2D(_GlobalPooling2D):
+class GlobalMaxPooling2D(GlobalPooling2D):
   """Global max pooling operation for spatial data.
 
   Arguments:
@@ -492,17 +699,17 @@ class GlobalMaxPooling2D(_GlobalPooling2D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.max(inputs, axis=[1, 2])
+      return backend.max(inputs, axis=[1, 2])
     else:
-      return K.max(inputs, axis=[2, 3])
+      return backend.max(inputs, axis=[2, 3])
 
 
-class _GlobalPooling3D(Layer):
+class GlobalPooling3D(Layer):
   """Abstract class for different global pooling 3D layers.
   """
 
   def __init__(self, data_format=None, **kwargs):
-    super(_GlobalPooling3D, self).__init__(**kwargs)
+    super(GlobalPooling3D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=5)
 
@@ -518,13 +725,13 @@ class _GlobalPooling3D(Layer):
 
   def get_config(self):
     config = {'data_format': self.data_format}
-    base_config = super(_GlobalPooling3D, self).get_config()
+    base_config = super(GlobalPooling3D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.GlobalAveragePooling3D',
            'keras.layers.GlobalAvgPool3D')
-class GlobalAveragePooling3D(_GlobalPooling3D):
+class GlobalAveragePooling3D(GlobalPooling3D):
   """Global Average pooling operation for 3D data.
 
   Arguments:
@@ -554,13 +761,13 @@ class GlobalAveragePooling3D(_GlobalPooling3D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.mean(inputs, axis=[1, 2, 3])
+      return backend.mean(inputs, axis=[1, 2, 3])
     else:
-      return K.mean(inputs, axis=[2, 3, 4])
+      return backend.mean(inputs, axis=[2, 3, 4])
 
 
 @tf_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D')
-class GlobalMaxPooling3D(_GlobalPooling3D):
+class GlobalMaxPooling3D(GlobalPooling3D):
   """Global Max pooling operation for 3D data.
 
   Arguments:
@@ -590,9 +797,9 @@ class GlobalMaxPooling3D(_GlobalPooling3D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.max(inputs, axis=[1, 2, 3])
+      return backend.max(inputs, axis=[1, 2, 3])
     else:
-      return K.max(inputs, axis=[2, 3, 4])
+      return backend.max(inputs, axis=[2, 3, 4])
 
 
 # Aliases
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py b/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
index bb003c1dddf80e2a745c1268a3a7d045f4e8b036..2c08b647ea0fafb7519240b0c81e8fa77f034f7f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
@@ -27,14 +27,14 @@ from tensorflow.python.platform import test
 
 class GlobalPoolingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_1d(self):
     testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
                              input_shape=(3, 4, 5))
     testing_utils.layer_test(
         keras.layers.pooling.GlobalAveragePooling1D, input_shape=(3, 4, 5))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_2d(self):
     testing_utils.layer_test(
         keras.layers.pooling.GlobalMaxPooling2D,
@@ -53,7 +53,7 @@ class GlobalPoolingTest(test.TestCase):
         kwargs={'data_format': 'channels_last'},
         input_shape=(3, 5, 6, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_3d(self):
     testing_utils.layer_test(
         keras.layers.pooling.GlobalMaxPooling3D,
@@ -75,7 +75,7 @@ class GlobalPoolingTest(test.TestCase):
 
 class Pooling2DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_2d(self):
     pool_size = (3, 3)
     for strides in [(1, 1), (2, 2)]:
@@ -88,7 +88,7 @@ class Pooling2DTest(test.TestCase):
           },
           input_shape=(3, 5, 6, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_2d(self):
     testing_utils.layer_test(
         keras.layers.AveragePooling2D,
@@ -122,7 +122,7 @@ class Pooling2DTest(test.TestCase):
 
 class Pooling3DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_3d(self):
     pool_size = (3, 3, 3)
     testing_utils.layer_test(
@@ -141,7 +141,7 @@ class Pooling3DTest(test.TestCase):
         },
         input_shape=(3, 4, 11, 12, 10))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_3d(self):
     pool_size = (3, 3, 3)
     testing_utils.layer_test(
@@ -163,7 +163,7 @@ class Pooling3DTest(test.TestCase):
 
 class Pooling1DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_1d(self):
     for padding in ['valid', 'same']:
       for stride in [1, 2]:
@@ -173,7 +173,7 @@ class Pooling1DTest(test.TestCase):
                     'padding': padding},
             input_shape=(3, 5, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_1d(self):
     for padding in ['valid', 'same']:
       for stride in [1, 2]:
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index 641b563a2594126f0bb6267cda52131ccbc766af..4c68c18825a47d87806a7a09d4054f974d569e00 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -435,8 +435,8 @@ class RNNTest(test.TestCase):
     cells[0].add_update(update_1, inputs=x)
     cells[0].add_update(update_2)
     self.assertEqual(len(layer.updates), 2)
-    self.assertEqual(layer.get_updates_for(None), [update_2])
-    self.assertEqual(layer.get_updates_for(x), [update_1])
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+    self.assertEqual(len(layer.get_updates_for(x)), 1)
 
   def test_rnn_dynamic_trainability(self):
     layer_class = keras.layers.SimpleRNN
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index c510e464ae54b2dbe87533f02971a4af6c9c7c45..9aee5f03b6d79f0b363f79d2b7a18c0b20a2883f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -23,11 +23,11 @@ import copy
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import utils as tf_layers_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -213,7 +213,7 @@ class TimeDistributed(Wrapper):
         input_length = array_ops.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
-      input_uid = tf_layers_util.object_list_uid(inputs)
+      input_uid = base_layer.object_list_uid(inputs)
       inputs = array_ops.reshape(inputs, (-1,) + input_shape[2:])
       self._input_map[input_uid] = inputs
       # (num_samples * timesteps, ...)
diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
index 444590033042d915b12645fb0239833b666a02f7..bc8698f235aac0f5fb0c3303cc4c70aa1efa08bc 100644
--- a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
@@ -607,12 +607,6 @@ class CustomCallSignatureTests(test.TestCase):
     self.assertAllClose(10. * expected_output, self.evaluate(output))
     output = model(first, second=second, training=False)
     self.assertAllClose(expected_output, self.evaluate(output))
-    if not context.executing_eagerly():
-      six.assertCountEqual(self, [first, second], model.inputs)
-    with self.assertRaises(TypeError):
-      # tf.layers.Layer expects an "inputs" argument, so all-keywords doesn't
-      # work at the moment.
-      model(first=first, second=second, fiddle_with_output='yes')
 
   @test_util.run_in_graph_and_eager_modes()
   def test_inputs_in_signature(self):
@@ -622,10 +616,14 @@ class CustomCallSignatureTests(test.TestCase):
       def call(self, inputs, some_other_arg, training=False):
         return inputs
 
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
     model = HasInputsAndOtherPositional()
     with self.assertRaisesRegexp(
         TypeError, 'everything else as a keyword argument'):
-      model(array_ops.ones([]), array_ops.ones([]))
+      x1, x2 = keras.Input((1, 1)), keras.Input((1, 1))
+      model(x1, x2)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_kwargs_in_signature(self):
@@ -649,13 +647,14 @@ class CustomCallSignatureTests(test.TestCase):
       def call(self, x, *args, **kwargs):
         return [x] + list(args)
 
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
     model = HasArgs()
-    arg1 = array_ops.ones([])
-    arg2 = array_ops.ones([])
-    arg3 = array_ops.ones([])
-    model(arg1, arg2, arg3, a=3)
+    x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+    model(x1, x2, x3, a=3)
     if not context.executing_eagerly():
-      six.assertCountEqual(self, [arg1, arg2, arg3], model.inputs)
+      six.assertCountEqual(self, [x1, x2, x3], model.inputs)
 
   def test_args_and_keywords_in_signature(self):
 
@@ -666,11 +665,9 @@ class CustomCallSignatureTests(test.TestCase):
 
     with context.graph_mode():
       model = HasArgs()
-      arg1 = array_ops.ones([])
-      arg2 = array_ops.ones([])
-      arg3 = array_ops.ones([])
+      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
       with self.assertRaisesRegexp(TypeError, 'args and arguments with'):
-        model(arg1, arg2, arg3, a=3)
+        model(x1, x2, x3, a=3)
 
   def test_training_no_default(self):
 
@@ -694,11 +691,9 @@ class CustomCallSignatureTests(test.TestCase):
 
     with context.graph_mode():
       model = TrainingNoDefaultWithPositional()
-      arg1 = array_ops.ones([])
-      arg2 = array_ops.ones([])
-      arg3 = array_ops.ones([])
+      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
       with self.assertRaisesRegexp(TypeError, 'after a non-input'):
-        model(arg1, arg2, arg3)
+        model(x1, x2, x3)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py b/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
index 583079d9626361eb594f16a57af86f103e5ee74d..8882a3a46bcb9de7283a67f001e67ed8644a0cf7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
@@ -21,17 +21,146 @@ from __future__ import print_function
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
-# pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.layers.utils import conv_input_length
-from tensorflow.python.layers.utils import conv_output_length
-from tensorflow.python.layers.utils import deconv_output_length as deconv_length
-from tensorflow.python.layers.utils import normalize_tuple
+from tensorflow.python.keras._impl.keras import backend
+
+
+def convert_data_format(data_format, ndim):
+  if data_format == 'channels_last':
+    if ndim == 3:
+      return 'NWC'
+    elif ndim == 4:
+      return 'NHWC'
+    elif ndim == 5:
+      return 'NDHWC'
+    else:
+      raise ValueError('Input rank not supported:', ndim)
+  elif data_format == 'channels_first':
+    if ndim == 3:
+      return 'NCW'
+    elif ndim == 4:
+      return 'NCHW'
+    elif ndim == 5:
+      return 'NCDHW'
+    else:
+      raise ValueError('Input rank not supported:', ndim)
+  else:
+    raise ValueError('Invalid data_format:', data_format)
+
+
+def normalize_tuple(value, n, name):
+  """Transforms a single integer or iterable of integers into an integer tuple.
+
+  Arguments:
+    value: The value to validate and convert. Could an int, or any iterable
+      of ints.
+    n: The size of the tuple to be returned.
+    name: The name of the argument being validated, e.g. "strides" or
+      "kernel_size". This is only used to format error messages.
+
+  Returns:
+    A tuple of n integers.
+
+  Raises:
+    ValueError: If something else than an int/long or iterable thereof was
+      passed.
+  """
+  if isinstance(value, int):
+    return (value,) * n
+  else:
+    try:
+      value_tuple = tuple(value)
+    except TypeError:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
+    if len(value_tuple) != n:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
+    for single_value in value_tuple:
+      try:
+        int(single_value)
+      except (ValueError, TypeError):
+        raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                         str(n) + ' integers. Received: ' + str(value) + ' '
+                         'including element ' + str(single_value) + ' of type' +
+                         ' ' + str(type(single_value)))
+    return value_tuple
+
+
+def conv_output_length(input_length, filter_size, padding, stride, dilation=1):
+  """Determines output length of a convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+      dilation: dilation rate, integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+  if padding == 'same':
+    output_length = input_length
+  elif padding == 'valid':
+    output_length = input_length - dilated_filter_size + 1
+  elif padding == 'full':
+    output_length = input_length + dilated_filter_size - 1
+  return (output_length + stride - 1) // stride
+
+
+def conv_input_length(output_length, filter_size, padding, stride):
+  """Determines input length of a convolution given output length.
+
+  Arguments:
+      output_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The input length (integer).
+  """
+  if output_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  if padding == 'same':
+    pad = filter_size // 2
+  elif padding == 'valid':
+    pad = 0
+  elif padding == 'full':
+    pad = filter_size - 1
+  return (output_length - 1) * stride - 2 * pad + filter_size
+
+
+def deconv_output_length(input_length, filter_size, padding, stride):
+  """Determines output length of a transposed convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  input_length *= stride
+  if padding == 'valid':
+    input_length += max(filter_size - stride, 0)
+  elif padding == 'full':
+    input_length -= (stride + filter_size - 2)
+  return input_length
 
 
 def normalize_data_format(value):
   if value is None:
-    value = K.image_data_format()
+    value = backend.image_data_format()
   data_format = value.lower()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('The `data_format` argument must be one of '
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index bbf1d2a3d9c3948271780ec3fad3316b4e6d53c3..f82e3277de70a631c93f0ef3c240f41ddb3390a7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -19,9 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import defaultdict
-import sys
 
 import numpy as np
+import six
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -160,13 +160,11 @@ def ask_to_proceed_with_overwrite(filepath):
   Returns:
       True if we can proceed with overwrite, False otherwise.
   """
-  get_input = input
-  if sys.version_info[:2] <= (2, 7):
-    get_input = raw_input
-  overwrite = get_input('[WARNING] %s already exists - overwrite? '
-                        '[y/n]' % (filepath))
-  while overwrite not in ['y', 'n']:
-    overwrite = get_input('Enter "y" (overwrite) or "n" (cancel).')
+  overwrite = six.moves.input('[WARNING] %s already exists - overwrite? '
+                              '[y/n]' % (filepath)).strip().lower()
+  while overwrite not in ('y', 'n'):
+    overwrite = six.moves.input('Enter "y" (overwrite) or "n" '
+                                '(cancel).').strip().lower()
   if overwrite == 'n':
     return False
   print('[TIP] Next time specify overwrite=True!')
diff --git a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8da5f7777733767f31fad205a23c2f08f9ffbb1c
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
@@ -0,0 +1,74 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow-related utilities."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import smart_cond as smart_module
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+
+
+def smart_cond(pred, true_fn=None, false_fn=None, name=None):
+  """Return either `true_fn()` if predicate `pred` is true else `false_fn()`.
+
+  If `pred` is a bool or has a constant value, we return either `true_fn()`
+  or `false_fn()`, otherwise we use `tf.cond` to dynamically route to both.
+
+  Arguments:
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    name: Optional name prefix when using `tf.cond`.
+
+  Returns:
+    Tensors returned by the call to either `true_fn` or `false_fn`.
+
+  Raises:
+    TypeError: If `true_fn` or `false_fn` is not callable.
+  """
+  if isinstance(pred, variables.Variable):
+    return control_flow_ops.cond(
+        pred, true_fn=true_fn, false_fn=false_fn, name=name)
+  return smart_module.smart_cond(
+      pred, true_fn=true_fn, false_fn=false_fn, name=name)
+
+
+def constant_value(pred):
+  """Return the bool value for `pred`, or None if `pred` had a dynamic value.
+
+  Arguments:
+    pred: A scalar, either a Python bool or a TensorFlow boolean variable
+      or tensor, or the Python integer 1 or 0.
+
+  Returns:
+    True or False if `pred` has a constant boolean value, None otherwise.
+
+  Raises:
+    TypeError: If `pred` is not a Variable, Tensor or bool, or Python
+      integer 1 or 0.
+  """
+  # Allow integer booleans.
+  if isinstance(pred, int):
+    if pred == 1:
+      pred = True
+    elif pred == 0:
+      pred = False
+
+  if isinstance(pred, variables.Variable):
+    return None
+  return smart_module.smart_constant_value(pred)
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 6c34ea181654c29da74164f0e220b2b9ee8d939e..c277c56b8dbbbc0358d92462386ba74bfc65e174 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1029,12 +1029,14 @@ tf_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
 )
@@ -1190,6 +1192,34 @@ cuda_py_test(
     shard_count = 10,
 )
 
+cuda_py_test(
+    name = "broadcast_to_ops_test",
+    size = "small",
+    srcs = ["broadcast_to_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+cuda_py_test(
+    name = "inplace_ops_test",
+    size = "small",
+    srcs = ["inplace_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+    ],
+    shard_count = 10,
+)
+
 cuda_py_test(
     name = "batch_matmul_op_test",
     size = "small",
@@ -1589,7 +1619,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "linalg_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["linalg_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2651,10 +2681,6 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
-    tags = [
-        "manual",
-        "notap",  # b/30226163
-    ],
 )
 
 cuda_py_test(
@@ -2804,7 +2830,7 @@ sycl_py_test(
 
 tf_py_test(
     name = "sets_test",
-    size = "small",
+    size = "medium",
     srcs = ["sets_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
index a223241e893d6838faec9a48cb4ca9cb3c24a211..d5f0c22d6e042a28f54fea2d4505208a3f7258c0 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -36,16 +36,18 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       stamp_token = ensemble.get_stamp_token()
       self.assertEqual(0, stamp_token.eval())
-      (_, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (_, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(0, num_trees.eval())
       self.assertEqual(0, num_finalized_trees.eval())
       self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
 
   def testCreateWithProto(self):
     with self.test_session():
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -141,6 +143,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 6
+          last_layer_node_start: 16
+          last_layer_node_end: 19
         }
       """, ensemble_proto)
       ensemble = boosted_trees_ops.TreeEnsemble(
@@ -148,28 +152,31 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
           stamp_token=7,
           serialized_proto=ensemble_proto.SerializeToString())
       resources.initialize_resources(resources.shared_resources()).run()
-      (stamp_token, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(7, stamp_token.eval())
       self.assertEqual(2, num_trees.eval())
       self.assertEqual(1, num_finalized_trees.eval())
       self.assertEqual(6, num_attempted_layers.eval())
+      self.assertAllEqual([16, 19], nodes_range.eval())
 
   def testSerializeDeserialize(self):
     with self.test_session():
       # Initialize.
       ensemble = boosted_trees_ops.TreeEnsemble('ensemble', stamp_token=5)
       resources.initialize_resources(resources.shared_resources()).run()
-      (stamp_token, num_trees, num_finalized_trees,
-       num_attempted_layers) = ensemble.get_states()
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       nodes_range) = ensemble.get_states()
       self.assertEqual(5, stamp_token.eval())
       self.assertEqual(0, num_trees.eval())
       self.assertEqual(0, num_finalized_trees.eval())
       self.assertEqual(0, num_attempted_layers.eval())
+      self.assertAllEqual([0, 1], nodes_range.eval())
 
       # Deserialize.
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -201,6 +208,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 5
+          last_layer_node_start: 3
+          last_layer_node_end: 7
         }
       """, ensemble_proto)
       with ops.control_dependencies([
@@ -208,13 +217,15 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
               stamp_token=3,
               serialized_proto=ensemble_proto.SerializeToString())
       ]):
-        (stamp_token, num_trees, num_finalized_trees,
-         num_attempted_layers) = ensemble.get_states()
+        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+         nodes_range) = ensemble.get_states()
       self.assertEqual(3, stamp_token.eval())
       self.assertEqual(1, num_trees.eval())
       # This reads from metadata, not really counting the layers.
       self.assertEqual(5, num_attempted_layers.eval())
       self.assertEqual(0, num_finalized_trees.eval())
+      self.assertAllEqual([3, 7], nodes_range.eval())
+
 
       # Serialize.
       new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index a54cc43517f4513b88b94ceb9b401b84b5ca053f..4d09cf94d421cc744621680170085435b050d0a8 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -29,7 +29,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation without any regularization."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -76,7 +76,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L2."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -123,7 +123,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L1."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
@@ -173,7 +173,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
     """Testing Gain calculation with L2."""
     with self.test_session() as sess:
       max_splits = 7
-      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      node_id_range = [1, 3]  # node 1 through 2 will be processed.
       stats_summary_list = [
           [
               [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
index 4226ff75c2327d09c0d89b29950605b610672603..d6c004774746dd28a7b376eb2e0564e5b71e5b40 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -132,6 +132,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -314,6 +316,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -461,6 +465,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 2
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -615,6 +621,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 5
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -624,7 +632,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
     """Test that the metadata is updated even though we can't split."""
     with self.test_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -655,6 +664,9 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
+
         }
       """, tree_ensemble_config)
 
@@ -685,7 +697,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
       # Expect no new splits created, but attempted (global) stats updated. Meta
       # data for this tree should not be updated (we didn't succeed building a
-      # layer.
+      # layer. Node ranges don't change.
       new_stamp, serialized = session.run(tree_ensemble.serialize())
       tree_ensemble = boosted_trees_pb2.TreeEnsemble()
       tree_ensemble.ParseFromString(serialized)
@@ -721,6 +733,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -730,7 +744,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
     """Test metadata is updated correctly when no split due to prepruning."""
     with self.test_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -761,6 +776,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """, tree_ensemble_config)
 
@@ -851,6 +868,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -941,6 +960,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -1046,6 +1067,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 2
+          last_layer_node_start: 3
+          last_layer_node_end: 7
         }
        """
       self.assertEqual(new_stamp, 2)
@@ -1179,6 +1202,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 3
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
        """
       self.assertEqual(new_stamp, 3)
@@ -1268,6 +1293,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 1
+          last_layer_node_end: 3
         }
       """
       self.assertEqual(new_stamp, 1)
@@ -1307,7 +1334,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       # Expect the ensemble to be empty as post-pruning will prune
       # the entire finalized tree.
       self.assertEqual(new_stamp, 2)
-      self.assertProtoEquals("""
+      self.assertProtoEquals(
+          """
       trees {
         nodes {
           leaf {
@@ -1359,6 +1387,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       growing_metadata {
         num_trees_attempted: 1
         num_layers_attempted: 2
+        last_layer_node_start: 0
+        last_layer_node_end: 1
       }
       """, res_ensemble)
 
@@ -1455,6 +1485,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
         growing_metadata {
           num_trees_attempted: 1
           num_layers_attempted: 1
+          last_layer_node_start: 0
+          last_layer_node_end: 1
         }
       """
       self.assertEqual(new_stamp, 1)
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a1bd958ba89080ff38e461646b07edbc6daec21
--- /dev/null
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -0,0 +1,85 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for broadcast_to ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test as test_lib
+
+
+class BroadcastToTest(test_util.TensorFlowTestCase):
+
+  def testBroadcastToBasic(self):
+    for dtype in [np.uint8, np.uint16, np.int8, np.int16, np.int32, np.int64]:
+      with self.test_session(use_gpu=True):
+        x = np.array([1, 2, 3], dtype=dtype)
+        v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+        v_np = np.broadcast_to(x, [3, 3])
+        self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToString(self):
+    with self.test_session(use_gpu=True):
+      x = np.array([b"1", b"2", b"3"])
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToBool(self):
+    with self.test_session(use_gpu=True):
+      x = np.array([True, False, True], dtype=np.bool)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToShape(self):
+    for input_dim in range(1, 6):
+      for output_dim in range(input_dim, 6):
+        with self.test_session(use_gpu=True):
+          input_shape = [2] * input_dim
+          output_shape = [2] * output_dim
+          x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+          v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+          v_np = np.broadcast_to(x, output_shape)
+          self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToScalar(self):
+    with self.test_session(use_gpu=True):
+      x = np.array(1, dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
+      v_np = np.broadcast_to(x, [3, 3])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  def testBroadcastToShapeTypeAndInference(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = np.array([1, 2, 3])
+        v_tf = array_ops.broadcast_to(
+            constant_op.constant(x),
+            constant_op.constant([3, 3], dtype=dtype))
+        shape = v_tf.get_shape().as_list()
+        v_np = np.broadcast_to(x, [3, 3])
+        self.assertAllEqual(v_tf.eval(), v_np)
+        # check shape inference when shape input is constant
+        self.assertAllEqual(shape, v_np.shape)
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index 5c8b71da174b8c38a797f8bf97c432d732d9978f..cb1359be159f5d8983f149cf42b2723dc0581ea8 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -19,16 +19,34 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
 
 
 class ClipTest(test.TestCase):
 
+  def testClipByValueGradient(self):
+    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+    outputs_1 = clip_ops.clip_by_value(inputs, 0.5, 3.5)
+    min_val = constant_op.constant([0.5, 0.5, 0.5, 0.5], dtype=dtypes.float32)
+    max_val = constant_op.constant([3.5, 3.5, 3.5, 3.5], dtype=dtypes.float32)
+    outputs_2 = clip_ops.clip_by_value(inputs, min_val, max_val)
+    with self.test_session():
+      error_1 = gradient_checker.compute_gradient_error(inputs, [4],
+                                                        outputs_1, [4])
+      self.assertLess(error_1, 1e-4)
+
+      error_2 = gradient_checker.compute_gradient_error(inputs, [4],
+                                                        outputs_2, [4])
+      self.assertLess(error_2, 1e-4)
+
   # ClipByValue test
   def testClipByValue(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
       clip_value = 4.4
@@ -37,18 +55,84 @@ class ClipTest(test.TestCase):
 
     self.assertAllClose(np_ans, tf_ans)
 
+  # [Tensor, Scalar, Scalar]
+  def testClipByValue0Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = 2
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Scalar]
+  def testClipByValue1Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = constant_op.constant([2, 2, 2, 3, 3, 3], shape=[2, 3],
+                                              dtype=dtype)
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Scalar, Tensor]
+  def testClipByValue2Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[4, 4, 4], [4, 5, 6]]
+        clip_value_min = 4
+        clip_value_max = constant_op.constant([6, 6, 6, 6, 6, 6], shape=[2, 3],
+                                              dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Tensor]
+  def testClipByValue3Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [5, 5, 6]]
+        clip_value_min = constant_op.constant([2, 2, 2, 5, 5, 5], shape=[2, 3],
+                                              dtype=dtype)
+        clip_value_max = constant_op.constant([5, 5, 5, 7, 7, 7], shape=[2, 3],
+                                              dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
   def testClipByValueBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
-      with self.assertRaises(ValueError):
-        _ = clip_ops.clip_by_value(x, -clip, clip)
-      with self.assertRaises(ValueError):
-        _ = clip_ops.clip_by_value(x, 1.0, clip)
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        ans = clip_ops.clip_by_value(x, -clip, clip)
+        tf_ans = ans.eval()
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        ans = clip_ops.clip_by_value(x, 1.0, clip)
+        tf_ans = ans.eval()
 
   def testClipByValueNonFinite(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([float('NaN'), float('Inf'), -float('Inf')])
       np_ans = [float('NaN'), 4.0, -4.0]
       clip_value = 4.0
@@ -60,7 +144,7 @@ class ClipTest(test.TestCase):
   # ClipByNorm tests
   def testClipByNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
@@ -76,7 +160,7 @@ class ClipTest(test.TestCase):
     self.assertAllClose(np_ans, tf_ans_tensor)
 
   def testClipByNormBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -85,7 +169,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -97,7 +181,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
@@ -109,7 +193,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim0(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
@@ -121,7 +205,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim1(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
@@ -133,7 +217,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClippedWithAxes(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
@@ -146,7 +230,7 @@ class ClipTest(test.TestCase):
   # ClipByGlobalNorm tests
   def testClipByGlobalNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -167,7 +251,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormClippedTensor(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -188,7 +272,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormSupportsNone(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -211,7 +295,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormWithIndexedSlicesClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = ops.IndexedSlices(
           constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
@@ -244,7 +328,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -263,7 +347,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([0.0, 0.0])
       # Norm = 0, no changes
@@ -282,7 +366,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClipped(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -294,7 +378,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClippedTensor(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -306,7 +390,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormNotClipped(self):
     # No norm clipping when average clip_norm >= 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -318,7 +402,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormZero(self):
     # No norm clipping when average clip_norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Average norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 18796f709566f022258806ce46cc706e8fe34354..107ee37fabbae56c5bf715e1e7953b62ac3c526b 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -65,6 +65,11 @@ class ConstantTest(test.TestCase):
     self._testCpu(x)
     self._testGpu(x)
 
+  def testInvalidDType(self):
+    # Test case for GitHub issue 18474
+    with self.assertRaises(TypeError):
+      constant_op.constant(dtypes_lib.string, "[,]")
+
   def testBFloat16(self):
     bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
@@ -653,12 +658,12 @@ class FillTest(test.TestCase):
     self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillComplex64(self):
-    np_ans = np.array([[0.15] * 3] * 2).astype(np.complex64)
-    self._compare([2, 3], np_ans[0][0], np_ans, use_gpu=False)
+    np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex64)
+    self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillComplex128(self):
-    np_ans = np.array([[0.15] * 3] * 2).astype(np.complex128)
-    self._compare([2, 3], np_ans[0][0], np_ans, use_gpu=False)
+    np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex128)
+    self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillString(self):
     np_ans = np.array([[b"yolo"] * 3] * 2)
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 75f8644f694c4cebb7dbdac4599244dda427bc05..e27eb00818a9f4b8dd8b8c9caf5bd48ee7400928 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -664,6 +664,23 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(42.0, grad.eval(feed_dict={c: 1}))
       self.assertAllEqual(3.0, grad.eval(feed_dict={c: 3}))
 
+  def testCondGrad_3(self):
+    with self.test_session():
+      c = array_ops.placeholder(dtypes.int32, shape=[])
+      ox = constant_op.constant(10.0)
+      pred = math_ops.less(c, 2)
+
+      def fn1(x):
+        m = x * x
+        return gradients_impl.gradients(m, [ox])[0]
+
+      fn2 = lambda: math_ops.multiply(ox, 3.0)
+      y = math_ops.multiply(7.0, ox)
+      r = control_flow_ops.cond(pred, lambda: fn1(y), fn2)
+
+      self.assertAllEqual(980.0, r.eval(feed_dict={c: 1}))
+      self.assertAllEqual(30.0, r.eval(feed_dict={c: 3}))
+
   def testNestedCond_Simple(self):
     with self.test_session():
       x = constant_op.constant(0., name="X")
diff --git a/tensorflow/python/kernel_tests/decode_csv_op_test.py b/tensorflow/python/kernel_tests/decode_csv_op_test.py
index fec52fa9cc7bcab1da67e797c2e121edac8c9345..4f49d726765e6019715a9b40f531b82df7f33126 100644
--- a/tensorflow/python/kernel_tests/decode_csv_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_csv_op_test.py
@@ -78,9 +78,11 @@ class DecodeCSVOpTest(test.TestCase):
     self._test(args, expected_out)
 
   def test2DNoQuoteDelimiter(self):
-    args = {"records": [["1", "2"], ['""', '"']],
-            "record_defaults": [[""]],
-            "use_quote_delim": False}
+    args = {
+        "records": [["1", "2"], ['""', '"']],
+        "record_defaults": [[""]],
+        "use_quote_delim": False
+    }
     expected_out = [[[b"1", b"2"], [b'""', b'"']]]
 
     self._test(args, expected_out)
@@ -88,8 +90,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testDouble(self):
     args = {
         "records": ["1.0", "-1.79e+308", '"1.79e+308"'],
-        "record_defaults": [np.array(
-            [], dtype=np.double)],
+        "record_defaults": [np.array([], dtype=np.double)],
     }
 
     expected_out = [[1.0, -1.79e+308, 1.79e+308]]
@@ -99,8 +100,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testInt64(self):
     args = {
         "records": ["1", "2", '"2147483648"'],
-        "record_defaults": [np.array(
-            [], dtype=np.int64)],
+        "record_defaults": [np.array([], dtype=np.int64)],
     }
 
     expected_out = [[1, 2, 2147483648]]
@@ -173,8 +173,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWithoutDefaultsError(self):
     args = {
         "records": [",1", "0.2,3", "3.0,"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -183,8 +182,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWrongFieldIntError(self):
     args = {
         "records": [",1", "0.2,234a", "3.0,2"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -202,8 +200,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWrongFieldFloatError(self):
     args = {
         "records": [",1", "0.2,2", "3.0adf,3"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -229,6 +226,73 @@ class DecodeCSVOpTest(test.TestCase):
     self._test(
         args, expected_err_re="Quoted field has to end with quote followed.*")
 
+  def testSelectCols(self):
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[1], [2]],
+        "select_cols": [0, 1]
+    }
+    expected_out = [[1, 4], [2, 5]]
+    self._test(args, expected_out)
+
+  def testSelectColsInclLast(self):
+    # The last col is a edge-casey; add test for that
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[0], [1], [2]],
+        "select_cols": [0, 1, 2]
+    }
+    expected_out = [[0, 4], [1, 5], [2, 6]]
+    self._test(args, expected_out)
+
+  def testWrongSelectColsInclLast(self):
+    # The last col is a edge-casey; add test for that
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[0], [1], [2]],
+        "select_cols": [0, 1, 3]
+    }
+    self._test(args, expected_err_re="Expect 3 fields but have 2 in record 0")
+
+  def testWrongSelectColsLen(self):
+    args = {
+        "records": ["1,2,3", "4,5,6"],
+        "record_defaults": [[0], [0], [0]],
+        "select_cols": [0]
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "Length of select_cols and record_defaults do not match."):
+      self._test(args)
+
+  def testWrongSelectColsSorting(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [1, 0]
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "select_cols is not strictly increasing."):
+      self._test(args)
+
+  def testWrongSelectColsIndicesNegative(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [-1, 0]  # -1 is not a valid index
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "select_cols contains negative values."):
+      self._test(args)
+
+  def testWrongSelectColsIndicesTooHigh(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [0, 3]  # 3 is not a valid index
+    }
+    # Only successfully parses one of the columns
+    self._test(args, expected_err_re="Expect 2 fields but have 1 in record 0")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/distributions/bijector_test.py b/tensorflow/python/kernel_tests/distributions/bijector_test.py
index 9f9fb5c0bb4c0e9d68ddf6034a8649ad5a6bd8e9..18582241e2fb69dffc0b66aa361aa77fbb97944f 100644
--- a/tensorflow/python/kernel_tests/distributions/bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bijector_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import abc
 
+import numpy as np
 import six
 
 from tensorflow.python.framework import constant_op
@@ -43,11 +44,10 @@ class BaseBijectorTest(test.TestCase):
       """Minimal specification of a `Bijector`."""
 
       def __init__(self):
-        super(_BareBonesBijector, self).__init__()
+        super(_BareBonesBijector, self).__init__(forward_min_event_ndims=0)
 
     with self.test_session() as sess:
       bij = _BareBonesBijector()
-      self.assertEqual(None, bij.event_ndims)
       self.assertEqual([], bij.graph_parents)
       self.assertEqual(False, bij.is_constant_jacobian)
       self.assertEqual(False, bij.validate_args)
@@ -67,13 +67,21 @@ class BaseBijectorTest(test.TestCase):
         self.assertAllEqual(shape, inverse_event_shape_)
         self.assertAllEqual(shape, bij.inverse_event_shape(shape))
 
-      for fn in ["forward",
-                 "inverse",
-                 "inverse_log_det_jacobian",
-                 "forward_log_det_jacobian"]:
-        with self.assertRaisesRegexp(
-            NotImplementedError, fn + " not implemented"):
-          getattr(bij, fn)(0)
+      with self.assertRaisesRegexp(
+          NotImplementedError, "inverse not implemented"):
+        bij.inverse(0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "forward not implemented"):
+        bij.forward(0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "inverse_log_det_jacobian not implemented"):
+        bij.inverse_log_det_jacobian(0, event_ndims=0)
+
+      with self.assertRaisesRegexp(
+          NotImplementedError, "forward_log_det_jacobian not implemented"):
+        bij.forward_log_det_jacobian(0, event_ndims=0)
 
 
 class IntentionallyMissingError(Exception):
@@ -85,7 +93,7 @@ class BrokenBijector(bijector.Bijector):
 
   def __init__(self, forward_missing=False, inverse_missing=False):
     super(BrokenBijector, self).__init__(
-        event_ndims=0, validate_args=False, name="broken")
+        validate_args=False, forward_min_event_ndims=0, name="broken")
     self._forward_missing = forward_missing
     self._inverse_missing = inverse_missing
 
@@ -120,35 +128,42 @@ class BijectorCachingTestBase(object):
 
   def testCachingOfForwardResults(self):
     broken_bijector = self.broken_bijector_cls(inverse_missing=True)
-    with self.test_session():
-      x = constant_op.constant(1.1)
+    x = constant_op.constant(1.1)
+
+    # Call forward and forward_log_det_jacobian one-by-one (not together).
+    y = broken_bijector.forward(x)
+    _ = broken_bijector.forward_log_det_jacobian(x, event_ndims=0)
 
-      # Call forward and forward_log_det_jacobian one-by-one (not together).
-      y = broken_bijector.forward(x)
-      _ = broken_bijector.forward_log_det_jacobian(x)
+    # Now, everything should be cached if the argument is y.
+    broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
+    try:
+      broken_bijector.inverse(y)
+      broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
+    except IntentionallyMissingError:
+      raise AssertionError("Tests failed! Cached values not used.")
 
-      # Now, everything should be cached if the argument is y.
-      try:
-        broken_bijector.inverse(y)
-        broken_bijector.inverse_log_det_jacobian(y)
-      except IntentionallyMissingError:
-        raise AssertionError("Tests failed! Cached values not used.")
+    # Different event_ndims should not be cached.
+    with self.assertRaises(IntentionallyMissingError):
+      broken_bijector.inverse_log_det_jacobian(y, event_ndims=1)
 
   def testCachingOfInverseResults(self):
     broken_bijector = self.broken_bijector_cls(forward_missing=True)
-    with self.test_session():
-      y = constant_op.constant(1.1)
+    y = constant_op.constant(1.1)
 
-      # Call inverse and inverse_log_det_jacobian one-by-one (not together).
-      x = broken_bijector.inverse(y)
-      _ = broken_bijector.inverse_log_det_jacobian(y)
+    # Call inverse and inverse_log_det_jacobian one-by-one (not together).
+    x = broken_bijector.inverse(y)
+    _ = broken_bijector.inverse_log_det_jacobian(y, event_ndims=0)
 
-      # Now, everything should be cached if the argument is x.
-      try:
-        broken_bijector.forward(x)
-        broken_bijector.forward_log_det_jacobian(x)
-      except IntentionallyMissingError:
-        raise AssertionError("Tests failed! Cached values not used.")
+    # Now, everything should be cached if the argument is x.
+    try:
+      broken_bijector.forward(x)
+      broken_bijector.forward_log_det_jacobian(x, event_ndims=0)
+    except IntentionallyMissingError:
+      raise AssertionError("Tests failed! Cached values not used.")
+
+    # Different event_ndims should not be cached.
+    with self.assertRaises(IntentionallyMissingError):
+      broken_bijector.forward_log_det_jacobian(x, event_ndims=1)
 
 
 class BijectorCachingTest(BijectorCachingTestBase, test.TestCase):
@@ -159,5 +174,107 @@ class BijectorCachingTest(BijectorCachingTestBase, test.TestCase):
     return BrokenBijector
 
 
+class ExpOnlyJacobian(bijector.Bijector):
+  """Only used for jacobian calculations."""
+
+  def __init__(self, forward_min_event_ndims=0):
+    super(ExpOnlyJacobian, self).__init__(
+        validate_args=False,
+        is_constant_jacobian=False,
+        forward_min_event_ndims=forward_min_event_ndims,
+        name="exp")
+
+  def _inverse_log_det_jacobian(self, y):
+    return -math_ops.log(y)
+
+  def _forward_log_det_jacobian(self, x):
+    return math_ops.log(x)
+
+
+class ConstantJacobian(bijector.Bijector):
+  """Only used for jacobian calculations."""
+
+  def __init__(self, forward_min_event_ndims=0):
+    super(ConstantJacobian, self).__init__(
+        validate_args=False,
+        is_constant_jacobian=True,
+        forward_min_event_ndims=forward_min_event_ndims,
+        name="c")
+
+  def _inverse_log_det_jacobian(self, y):
+    return constant_op.constant(2., y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    return constant_op.constant(-2., x.dtype)
+
+
+class BijectorReduceEventDimsTest(test.TestCase):
+  """Test caching with BrokenBijector."""
+
+  def testReduceEventNdimsForward(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian()
+    self.assertAllClose(
+        np.log(x),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        np.sum(np.log(x), axis=-1),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        np.sum(np.log(x), axis=(-1, -2)),
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsForwardRaiseError(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian(forward_min_event_ndims=1)
+    with self.assertRaisesRegexp(ValueError, "must be larger than"):
+      bij.forward_log_det_jacobian(x, event_ndims=0)
+
+  def testReduceEventNdimsInverse(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian()
+    self.assertAllClose(
+        -np.log(x),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        np.sum(-np.log(x), axis=-1),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        np.sum(-np.log(x), axis=(-1, -2)),
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsInverseRaiseError(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ExpOnlyJacobian(forward_min_event_ndims=1)
+    with self.assertRaisesRegexp(ValueError, "must be larger than"):
+      bij.inverse_log_det_jacobian(x, event_ndims=0)
+
+  def testReduceEventNdimsForwardConstJacobian(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ConstantJacobian()
+    self.assertAllClose(
+        -2.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        -4.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        -8.,
+        self.evaluate(bij.forward_log_det_jacobian(x, event_ndims=2)))
+
+  def testReduceEventNdimsInverseConstJacobian(self):
+    x = [[[1., 2.], [3., 4.]]]
+    bij = ConstantJacobian()
+    self.assertAllClose(
+        2.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=0)))
+    self.assertAllClose(
+        4.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=1)))
+    self.assertAllClose(
+        8.,
+        self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=2)))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
index e8f9d0b728d8f831becc82cdba0ae2bf3d5da52a..b347c20db25df6dc0f278d9b34b4588277104850 100644
--- a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
@@ -27,14 +27,19 @@ class IdentityBijectorTest(test.TestCase):
   """Tests correctness of the Y = g(X) = X transformation."""
 
   def testBijector(self):
-    with self.test_session():
-      bijector = identity_bijector.Identity()
-      self.assertEqual("identity", bijector.name)
-      x = [[[0.], [1.]]]
-      self.assertAllEqual(x, bijector.forward(x).eval())
-      self.assertAllEqual(x, bijector.inverse(x).eval())
-      self.assertAllEqual(0., bijector.inverse_log_det_jacobian(x).eval())
-      self.assertAllEqual(0., bijector.forward_log_det_jacobian(x).eval())
+    bijector = identity_bijector.Identity(validate_args=True)
+    self.assertEqual("identity", bijector.name)
+    x = [[[0.], [1.]]]
+    self.assertAllEqual(x, self.evaluate(bijector.forward(x)))
+    self.assertAllEqual(x, self.evaluate(bijector.inverse(x)))
+    self.assertAllEqual(
+        0.,
+        self.evaluate(
+            bijector.inverse_log_det_jacobian(x, event_ndims=3)))
+    self.assertAllEqual(
+        0.,
+        self.evaluate(
+            bijector.forward_log_det_jacobian(x, event_ndims=3)))
 
   def testScalarCongruency(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 1e5c118cbc3573af0a2ce95239f499a5e52a0c86..f7a7119b3446ef875bb12dd9c2049fec74f886d8 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -551,7 +551,6 @@ class OrthogonalInitializerTest(test.TestCase):
       init2 = init_ops.orthogonal_initializer(gain=3.14, seed=1, dtype=dtype)
       with self.test_session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
-      with self.test_session(graph=ops.Graph(), use_gpu=True):
         t2 = init2(shape).eval()
       return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
 
@@ -610,7 +609,6 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
                                                       seed=1, dtype=dtype)
       with self.test_session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
-      with self.test_session(graph=ops.Graph(), use_gpu=True):
         t2 = init2(shape).eval()
       return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
 
@@ -674,6 +672,103 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
       self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
 
 
+class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
+
+  def testInitializerIdentical(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      self.assertTrue(identicaltest(self, init1, init2, (3, 3, 10, 10)))
+
+  def testInitializerDifferent(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(seed=2, dtype=dtype)
+      self.assertFalse(identicaltest(self, init1, init2, (3, 3, 10, 10)))
+
+  def testDuplicatedInitializer(self):
+    init = init_ops.convolutional_orthogonal_2d()
+    self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 10, 10)))
+
+  def testInvalidDataType(self):
+    self.assertRaises(
+        ValueError, init_ops.convolutional_orthogonal_2d,
+        dtype=dtypes.string)
+
+  def testInvalidShape(self):
+    init1 = init_ops.convolutional_orthogonal_2d()
+    with self.test_session(graph=ops.Graph(), use_gpu=True):
+      self.assertRaises(ValueError, init1, shape=[3, 3, 6, 5])
+
+  def testGain(self):
+    shape = (3, 3, 10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
+      init2 = init_ops.convolutional_orthogonal_2d(gain=3.14,
+                                                   seed=1, dtype=dtype)
+      with self.test_session(graph=ops.Graph(), use_gpu=True):
+        t1 = init1(shape).eval()
+        t2 = init2(shape).eval()
+      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+
+  def testShapesValues(self):
+    def circular_pad(input_, width, kernel_size):
+      """Pad input_ for computing (circular) convolution.
+
+      Args:
+        input_: the input tensor
+        width: the width of the tensor.
+        kernel_size: the kernel size of the filter.
+      Returns:
+        a tensor whose width is (width + kernel_size - 1).
+      """
+      beg = kernel_size // 2
+      end = kernel_size - 1 - beg
+
+      tmp_up = array_ops.slice(input_, [0, width - beg, 0, 0],
+                               [-1, beg, width, -1])
+      tmp_down = array_ops.slice(input_, [0, 0, 0, 0], [-1, end, width, -1])
+      tmp = array_ops.concat([tmp_up, input_, tmp_down], 1)
+
+      new_width = width + kernel_size - 1
+      tmp_left = array_ops.slice(tmp, [0, 0, width - beg, 0],
+                                 [-1, new_width, beg, -1])
+      tmp_right = array_ops.slice(tmp, [0, 0, 0, 0], [-1, new_width, end, -1])
+
+      final = array_ops.concat([tmp_left, tmp, tmp_right], 2)
+      return final
+
+    cout = 45
+    shape = [64, 28, 28, 32]
+    outputs_shape = shape[0:-1] + [cout]
+    dtype = dtypes.float32
+    tol = 1e-3
+    gain = 3.14
+    # Check orthogonality/isometry by computing the ratio between
+    # the 2-norms of the inputs and ouputs.
+    for kernel_size in [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]:
+      convolution = convolutional.conv2d
+      inputs = random_ops.random_normal(shape, dtype=dtype)
+      inputs_2norm = linalg_ops.norm(inputs)
+      input_with_circular_pad = circular_pad(inputs, shape[1], kernel_size[0])
+      outputs = convolution(
+          input_with_circular_pad, padding="valid", filters=cout,
+          kernel_size=kernel_size, use_bias=False,
+          kernel_initializer=init_ops.convolutional_orthogonal_2d(gain=gain))
+      outputs_2norm = linalg_ops.norm(outputs)
+      my_ops = variables.global_variables_initializer()
+      with self.test_session(use_gpu=True) as sess:
+        sess.run(my_ops)
+        # Check the shape of the outputs
+        t = outputs.eval()
+        self.assertAllEqual(t.shape, outputs_shape)
+        # Check isometry of the orthogonal kernel.
+        self.assertAllClose(
+            sess.run(inputs_2norm)/np.sqrt(np.prod(shape)),
+            sess.run(outputs_2norm)/(np.sqrt(np.prod(shape))*np.sqrt(gain)),
+            rtol=tol, atol=tol)
+
+
 class IdentityInitializerTest(test.TestCase):
 
   def testInvalidDataType(self):
diff --git a/tensorflow/python/kernel_tests/inplace_ops_test.py b/tensorflow/python/kernel_tests/inplace_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f95e13187fcd5cc199d871ea5efdca363b37cd0
--- /dev/null
+++ b/tensorflow/python/kernel_tests/inplace_ops_test.py
@@ -0,0 +1,198 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for inplace_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.platform import test as test_lib
+
+
+class InplaceOpsTest(test_util.TensorFlowTestCase):
+
+  def testBasicUpdate(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] = 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, [-1],
+                                       array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] = 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] = 7
+        self.assertAllClose(x.eval(), y)
+
+  def testBasicUpdateBool(self):
+    with self.test_session(use_gpu=True):
+      x = array_ops.ones([7, 3], dtypes.bool)
+      y = np.ones([7, 3], dtypes.bool.as_numpy_dtype)
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, [3], array_ops.ones([1, 3],
+                                                            dtypes.bool))
+      y[3, :] = True
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, [-1],
+                                     array_ops.zeros([1, 3], dtypes.bool))
+      y[-1, :] = False
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, 5, array_ops.zeros([3], dtypes.bool))
+      y[5, :] = False
+      self.assertAllClose(x.eval(), y)
+
+  def testBasicAdd(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = array_ops.inplace_add(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] += 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, [-1], array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] += 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] += 7
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, None, array_ops.ones([7, 3], dtype) * 99)
+        y[:, :] += 99
+        self.assertAllClose(x.eval(), y)
+
+  def testBasicSub(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] -= 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, [-1], array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] -= 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] -= 7
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, None, array_ops.ones([7, 3], dtype) * 99)
+        y[:, :] -= 99
+        self.assertAllClose(x.eval(), y)
+
+  def testRandom(self):
+    with self.test_session(use_gpu=True):
+      d0, d1, d2 = 100, 3, 5
+      x = array_ops.zeros([d0, d1, d2])
+      y = np.zeros([d0, d1, d2])
+      for _ in xrange(20):
+        idx = np.random.choice(d0, d0 // 10, replace=False)
+        val = np.random.randint(10, size=(d0 // 10, d1, d2))
+        op = np.random.randint(3)
+        if op == 0:
+          x = inplace_ops.inplace_update(x, idx, val)
+          y[idx, :] = val
+        elif op == 1:
+          x = inplace_ops.inplace_add(x, idx, val)
+          y[idx, :] += val
+        elif op == 2:
+          x = inplace_ops.inplace_sub(x, idx, val)
+          y[idx, :] -= val
+        self.assertAllClose(x.eval(), y)
+
+  def testRandom1D(self):
+    with self.test_session(use_gpu=True):
+      d0 = 100
+      x = array_ops.zeros([d0])
+      y = np.zeros([d0])
+      for _ in xrange(20):
+        idx = np.random.choice(d0, d0 // 10, replace=False)
+        val = np.random.randint(10, size=(d0 // 10))
+        op = np.random.randint(3)
+        if op == 0:
+          x = inplace_ops.inplace_update(x, idx, val)
+          y[idx] = val
+        elif op == 1:
+          x = inplace_ops.inplace_add(x, idx, val)
+          y[idx] += val
+        elif op == 2:
+          x = inplace_ops.inplace_sub(x, idx, val)
+          y[idx] -= val
+        self.assertAllClose(x.eval(), y)
+
+  def testAlias(self):
+    with self.test_session(use_gpu=True) as sess:
+      x = array_ops.ones([2, 3])
+      y = inplace_ops.alias_inplace_add(x, [0], [[1, 2, 3]])
+      with ops.control_dependencies([y]):
+        z = array_ops.identity(x)
+        _, vy, vz = sess.run([x, y, z])
+      self.assertAllClose(vy, vz)
+
+  def testError(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "must be a vector"):
+        _ = inplace_ops.inplace_update([[1.]], [[0]], [[10]]).eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "x and v shape doesn't match"):
+        _ = inplace_ops.inplace_update([[1.]], [0], [10]).eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "i and x shape doesn't match"):
+        _ = inplace_ops.inplace_update([[1.]], [0, 1], [[10]]).eval()
+
+  def testEmpty(self):
+    for dtype in [
+        dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.bool
+    ]:
+      with self.test_session(use_gpu=True):
+        test_shapes = [(), (1,), (2, 3), (0, 2), (2, 3, 5), (2, 0, 5)]
+        for shape in test_shapes:
+          val = inplace_ops.empty(shape, dtype).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          val = inplace_ops.empty(shape, dtype, init=True).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
+          val = inplace_ops.empty_like(array_ops.zeros(shape, dtype)).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          val = inplace_ops.empty_like(
+              array_ops.zeros(shape, dtype), init=True).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
+
+        val = inplace_ops.empty((1, 2), dtypes.string, init=True).eval()
+        self.assertEqual(val.tolist(), [[b"", b""]])
+
+        val = inplace_ops.empty((1, 2), dtypes.string, init=False).eval()
+        self.assertEqual(val.tolist(), [[b"", b""]])
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index dbbed39c727f01ed1fae271375575c690958c7d8..20845997605b2a001abbf0366b91db958087c1ff 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -33,6 +33,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -43,71 +45,83 @@ def scalar_shape():
 
 class ListOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPop(self):
     l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
                                    element_shape=scalar_shape())
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 1.0)
+    self.assertAllEqual(self.evaluate(e), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPopGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testPushPop()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStack(self):
     l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
                                    element_shape=scalar_shape())
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(t, [1.0, 2.0])
+    self.assertAllEqual(self.evaluate(t), [1.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStackGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testStack()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 2.0)
+    self.assertAllEqual(self.evaluate(e), 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 1.0)
-    self.assertAllEqual(list_ops.tensor_list_length(l), 0)
+    self.assertAllEqual(self.evaluate(e), 1.0)
+    self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testFromTensorGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetItem(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
-    self.assertAllEqual(e0, 1.0)
+    self.assertAllEqual(self.evaluate(e0), 1.0)
     l = list_ops.tensor_list_set_item(l, 0, 3.0)
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(t, [3.0, 2.0])
+    self.assertAllEqual(self.evaluate(t), [3.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testGetSetItem()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUnknownShape(self):
-    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                   element_shape=-1)
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=-1)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0, 2.0]))
-    _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, [1.0, 2.0])
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e), [1.0, 2.0])
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testCPUGPUCopy(self):
     if not context.num_gpus():
       return
@@ -116,15 +130,16 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with context.device("gpu:0"):
       l_gpu = array_ops.identity(l)
       self.assertAllEqual(
-          list_ops.tensor_list_pop_back(
-              l_gpu, element_dtype=dtypes.float32)[1],
-          2.0)
+          self.evaluate(
+              list_ops.tensor_list_pop_back(
+                  l_gpu, element_dtype=dtypes.float32)[1]), 2.0)
     l_cpu = array_ops.identity(l_gpu)
     self.assertAllEqual(
-        list_ops.tensor_list_pop_back(
-            l_cpu, element_dtype=dtypes.float32)[1],
-        2.0)
+        self.evaluate(
+            list_ops.tensor_list_pop_back(
+                l_cpu, element_dtype=dtypes.float32)[1]), 2.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStack(self):
     with context.graph_mode(), self.test_session():
       tl = list_ops.empty_tensor_list(
@@ -132,9 +147,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
           element_dtype=dtypes.int32)
       tl = list_ops.tensor_list_push_back(tl, [1])
       self.assertAllEqual(
-          list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32).eval(),
+          self.evaluate(
+              list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32)),
           [[1]])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackInLoop(self):
     with context.graph_mode(), self.test_session():
       t1 = list_ops.empty_tensor_list(
@@ -149,9 +166,10 @@ class ListOpsTest(test_util.TensorFlowTestCase):
 
       i, t1 = control_flow_ops.while_loop(lambda i, t1: math_ops.less(i, 4),
                                           body, [i, t1])
-      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32).eval()
-      self.assertAllEqual(s1, [0, 1, 2, 3])
+      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32)
+      self.assertAllEqual(self.evaluate(s1), [0, 1, 2, 3])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackSwitchDtype(self):
     with context.graph_mode(), self.test_session():
       list_ = list_ops.empty_tensor_list(
@@ -169,11 +187,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       for _ in range(2):
         list_, m = body(list_, m)
 
-      s1 = list_ops.tensor_list_stack(
-          list_, element_dtype=dtypes.float32).eval()
+      s1 = list_ops.tensor_list_stack(list_, element_dtype=dtypes.float32)
       np_s1 = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)
-      self.assertAllEqual(s1, np_s1)
+      self.assertAllEqual(self.evaluate(s1), np_s1)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackInLoopSwitchDtype(self):
     with context.graph_mode(), self.test_session():
       t1 = list_ops.empty_tensor_list(
@@ -193,10 +211,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
 
       i, m, t1 = control_flow_ops.while_loop(
           lambda i, m, t1: math_ops.less(i, 4), body, [i, m, t1])
-      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.float32).eval()
+      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.float32)
       np_s1 = np.vstack([np.arange(1, 4) * i for i in range(4)])
-      self.assertAllEqual(s1, np_s1)
+      self.assertAllEqual(self.evaluate(s1), np_s1)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSerialize(self):
     # pylint: disable=g-import-not-at-top
     try:
@@ -226,8 +245,9 @@ class ListOpsTest(test_util.TensorFlowTestCase):
               l_ps, element_dtype=dtypes.float32)
         with ops.device("/job:worker"):
           worker_e = array_ops.identity(e)
-        self.assertAllEqual(worker_e.eval(), [2.0])
+        self.assertAllEqual(self.evaluate(worker_e), [2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPopGradients(self):
     with backprop.GradientTape() as tape:
       l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
@@ -237,18 +257,21 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       l = list_ops.tensor_list_push_back(l, c)
       l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       e = 2 * e
-    self.assertAllEqual(tape.gradient(e, [c])[0], 2.0)
+    self.assertAllEqual(self.evaluate(tape.gradient(e, [c])[0]), 2.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStackFromTensorGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
       l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
       c2 = list_ops.tensor_list_stack(
-          l, element_dtype=dtypes.float32)
+          l, element_dtype=dtypes.float32, num_elements=2)
       result = c2 * 2.0
-    self.assertAllEqual(tape.gradient(result, [c])[0], [2.0, 2.0])
+    grad = tape.gradient(result, [c])[0]
+    self.assertAllEqual(self.evaluate(grad), [2.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
@@ -261,16 +284,82 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       ee = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
       y = e * e + ee * ee
     grad_c, grad_c2 = tape.gradient(y, [c, c2])
-    self.assertAllEqual(grad_c, [0.0, 4.0])
-    self.assertAllEqual(grad_c2, 6.0)
+    self.assertAllEqual(self.evaluate(grad_c), [0.0, 4.0])
+    self.assertAllEqual(self.evaluate(grad_c2), 6.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSetOutOfBounds(self):
     c = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
     with self.assertRaises(errors.InvalidArgumentError):
-      list_ops.tensor_list_set_item(l, 20, 3.0)
+      self.evaluate(list_ops.tensor_list_set_item(l, 20, 3.0))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testResourceVariableScatterGather(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    v = vs.get_variable("var", initializer=[l] * 10, use_resource=True)
+    v_r_0_stacked = list_ops.tensor_list_stack(v[0], dtypes.float32)
+    self.evaluate(v.initializer)
+    self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_0_stacked))
+    v_r_sparse_stacked = list_ops.tensor_list_stack(
+        v.sparse_read(0), dtypes.float32)
+    self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_sparse_stacked))
+    l_new_0 = list_ops.tensor_list_from_tensor(
+        [3.0, 4.0], element_shape=scalar_shape())
+    l_new_1 = list_ops.tensor_list_from_tensor(
+        [5.0, 6.0], element_shape=scalar_shape())
+    updated_v = state_ops.scatter_update(v, [3, 5], [l_new_0, l_new_1])
+    updated_v_elems = array_ops.unstack(updated_v)
+    updated_v_stacked = [
+        list_ops.tensor_list_stack(el, dtypes.float32) for el in updated_v_elems
+    ]
+    expected = ([[1.0, 2.0]] * 3 + [[3.0, 4.0], [1.0, 2.0], [5.0, 6.0]] +
+                [[1.0, 2.0]] * 4)
+    self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testPushBackBatch(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l_batch = array_ops.stack([l0, l1])
+    l_push = list_ops.tensor_list_push_back_batch(l_batch, [3.0, 4.0])
+    l_unstack = array_ops.unstack(l_push)
+    l0_ret = list_ops.tensor_list_stack(l_unstack[0], dtypes.float32)
+    l1_ret = list_ops.tensor_list_stack(l_unstack[1], dtypes.float32)
+    self.assertAllClose([1.0, 2.0, 3.0], self.evaluate(l0_ret))
+    self.assertAllClose([-1.0, 4.0], self.evaluate(l1_ret))
+
+    with ops.control_dependencies([l_push]):
+      l_unstack_orig = array_ops.unstack(l_batch)
+      l0_orig_ret = list_ops.tensor_list_stack(l_unstack_orig[0],
+                                               dtypes.float32)
+      l1_orig_ret = list_ops.tensor_list_stack(l_unstack_orig[1],
+                                               dtypes.float32)
+
+    # Check that without aliasing, push_back_batch still works; and
+    # that it doesn't modify the input.
+    l0_r_v, l1_r_v, l0_orig_v, l1_orig_v = self.evaluate(
+        (l0_ret, l1_ret, l0_orig_ret, l1_orig_ret))
+    self.assertAllClose([1.0, 2.0, 3.0], l0_r_v)
+    self.assertAllClose([-1.0, 4.0], l1_r_v)
+    self.assertAllClose([1.0, 2.0], l0_orig_v)
+    self.assertAllClose([-1.0], l1_orig_v)
+
+    # Pushing back mismatched shapes fails.
+    with self.assertRaises((errors.InvalidArgumentError, ValueError)):
+      self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, []))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "incompatible shape to a list at index 0"):
+      self.evaluate(
+          list_ops.tensor_list_push_back_batch(l_batch, [[3.0], [4.0]]))
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Invalid data type at index 0"):
+      self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
 
 if __name__ == "__main__":
-  ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index b8200ac0cb1e4315a56181779c70da1126d8fc15..7948a475bbaad5978368f1d68372174e4b7a8ab7 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -88,6 +88,16 @@ class RollTest(test_util.TensorFlowTestCase):
         x = np.random.rand(3, 2, 1, 1).astype(t)
         self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
 
+  def testNegativeAxis(self):
+    self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
+    self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
+    # Make sure negative axis shoudl be 0 <= axis + dims < dims
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
+                       3, -10).eval()
+
   def testRollInputMustVectorHigherRaises(self):
     tensor = 7
     shift = 1
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index a9dc7b7de000024f23b88406bf0c1c2f32ac4fac..051c7d86bf2342f15b587fc350bfbede7fae2285 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -46,7 +46,7 @@ def composed_sampler(logits, num_samples):
   logits = array_ops.expand_dims(logits, -1)
 
   # [batch size, num samples]
-  return math_ops.argmax(logits + noise, dimension=1)
+  return math_ops.argmax(logits + noise, axis=1)
 
 
 native_sampler = random_ops.multinomial
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index df37dd98ece57ae7c3835ab63b720b29fc19c975..e4b5c3832a2252aedc8820a650b022cd30b7f285 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -228,6 +228,17 @@ class RandomUniformTest(test.TestCase):
         print("count = ", count)
       self.assertTrue(count < count_limit)
 
+  def testUniformIntsWithInvalidShape(self):
+    for dtype in dtypes.int32, dtypes.int64:
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=[1, 2], maxval=3, dtype=dtype)
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=1, maxval=[2, 3], dtype=dtype)
+
   # Check that uniform ints actually follow a uniform distribution.
   def testUniformInts(self):
     minv = -2
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index edc63264a3549e91f2d6278a935be29eda5c99be..984192258c9724dd9d73105c65177786def98e83 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -36,6 +36,9 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import momentum
+from tensorflow.python.training import saver
+from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 
 
@@ -174,73 +177,59 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
         resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32))
     self.assertEqual(read, 2)
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterAdd(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(resource_variable_ops.assign_variable_op(
-          handle, constant_op.constant([[1]], dtype=dtypes.int32)))
-      self.evaluate(resource_variable_ops.resource_scatter_add(
-          handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_add(
+            handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterSub(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_sub(handle, [0],
-                                                     constant_op.constant(
-                                                         [[2]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[-1]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_sub(
+            handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[-1]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMul(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_mul(handle, [0],
-                                                     constant_op.constant(
-                                                         [[5]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[5]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_mul(
+            handle, [0], constant_op.constant([[5]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[5]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterDiv(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_div(handle, [0],
-                                                     constant_op.constant(
-                                                         [[3]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[2]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_div(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[2]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMin(self):
     with ops.device("cpu:0"):
       handle = resource_variable_ops.var_handle_op(
@@ -258,131 +247,115 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
       self.assertEqual(self.evaluate(read), [[3]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  def testMetagraph(self):
+    with ops.Graph().as_default():
+      with variable_scope.variable_scope("foo", use_resource=True):
+        a = variable_scope.get_variable("a", initializer=10.0)
+
+      momentum.MomentumOptimizer(
+          learning_rate=0.001, momentum=0.1).minimize(
+              a,
+              colocate_gradients_with_ops=True,
+              global_step=training_util.get_or_create_global_step())
+
+      graph = ops.get_default_graph()
+      meta_graph_def = saver.export_meta_graph(graph=graph)
+
+    with ops.Graph().as_default():
+      saver.import_meta_graph(meta_graph_def, import_scope="")
+      meta_graph_two = saver.export_meta_graph(graph=graph)
+    self.assertEqual(meta_graph_def, meta_graph_two)
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMax(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_max(handle, [0],
-                                                     constant_op.constant(
-                                                         [[3]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[6]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_max(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[6]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterAddScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_add(handle, [0],
-                                                     constant_op.constant(
-                                                         2,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_add(
+            handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterSubScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_sub(handle, [0],
-                                                     constant_op.constant(
-                                                         2,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[-1]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_sub(
+            handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[-1]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMulScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_mul(handle, [0],
-                                                     constant_op.constant(
-                                                         5,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[5]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_mul(
+            handle, [0], constant_op.constant(5, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[5]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterDivScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_div(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[2]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_div(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[2]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMinScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_min(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_min(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMaxScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_max(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[6]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_max(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterUpdateString(self):
     handle = resource_variable_ops.var_handle_op(
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 86ab9fbb70b5efcf06cc064617df14deb18c1f98..51aa671098905e840b7c96cd5a984887d347adf9 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -24,11 +24,13 @@ import threading
 import numpy
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
@@ -118,6 +120,16 @@ class VariableScopeTest(test.TestCase):
         w = variable_scope.get_variable("w", [])
         self.assertEqual(w.dtype.base_dtype, dtypes.float16)
 
+  def testGetVariableInGraphNestedUnderEagerContext(self):
+    with context.eager_mode():
+
+      @function.defun
+      def f():
+        v = variable_scope.get_variable("should_be_resource", [])
+        self.assertEqual(type(v), resource_variable_ops.ResourceVariable)
+
+      f()
+
   def testEagerVariableStore(self):
     with context.eager_mode():
       store = variable_scope.EagerVariableStore()
@@ -156,6 +168,28 @@ class VariableScopeTest(test.TestCase):
       for v in new_store.variables():
         self.assertEqual(v.numpy(), 1)
 
+  def testEagerVariableStoreWithEagerDefun(self):
+    with context.eager_mode():
+
+      @function.defun
+      def f():
+        x = constant_op.constant([[2.0]])
+        d1 = core_layers.Dense(
+            1, name="my_dense", kernel_initializer=init_ops.ones_initializer())
+        _ = d1(x)  # create variables
+        self.assertEqual(len(d1.variables), 2)
+        v1, v2 = d1.variables
+        d2 = core_layers.Dense(
+            1,
+            name="my_dense",
+            kernel_initializer=init_ops.ones_initializer(),
+            _reuse=True)
+        _ = d2(x)
+        self.assertEqual(len(d2.variables), 2)
+        v3, v4 = d2.variables
+        self.assertAllEqual([v1, v2], [v3, v4])
+      f()
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
@@ -209,15 +243,15 @@ class VariableScopeTest(test.TestCase):
 
           with variable_scope.variable_scope("not_cached", caching_device=""):
             v2_not_cached = variable_scope.get_variable("v", [])
-            self.assertFalse(v2_not_cached.value().device.startswith(
-                caching_device))
+            self.assertFalse(
+                v2_not_cached.value().device.startswith(caching_device))
 
           with variable_scope.variable_scope(
               "not_cached_identity_device",
               caching_device=lambda op: op.device):
             v2_identity_device = variable_scope.get_variable("v", [])
-            self.assertFalse(v2_identity_device.value().device.startswith(
-                caching_device))
+            self.assertFalse(
+                v2_identity_device.value().device.startswith(caching_device))
 
           with variable_scope.variable_scope("we_will_do_it_live") as vs_live:
             vs_live.set_caching_device("/job:live")
@@ -484,15 +518,19 @@ class VariableScopeTest(test.TestCase):
 
   def testVarScopeGetOrCreateReuse(self):
     with self.test_session():
+
       def test_value(value):
         x = constant_op.constant(value)
-        with variable_scope.variable_scope("testVarScopeGetOrCreateReuse_bar",
-                                           reuse=variable_scope.AUTO_REUSE):
+        with variable_scope.variable_scope(
+            "testVarScopeGetOrCreateReuse_bar",
+            reuse=variable_scope.AUTO_REUSE):
           _ = state_ops.assign(variable_scope.get_variable("var", []), x)
-        with variable_scope.variable_scope("testVarScopeGetOrCreateReuse_bar",
-                                           reuse=variable_scope.AUTO_REUSE):
+        with variable_scope.variable_scope(
+            "testVarScopeGetOrCreateReuse_bar",
+            reuse=variable_scope.AUTO_REUSE):
           _ = variable_scope.get_variable("var", [])
         self.assertEqual(value, x.eval())
+
       test_value(42.)  # Variable is created.
       test_value(13.)  # Variable is reused hereafter.
       test_value(17.)
@@ -551,19 +589,16 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("default") as default:
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer/w:0")
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer_1/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer_1/w:0")
         with variable_scope.variable_scope(default):
           pass
         # No matter the jump in the middle, unique numbering continues.
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer_2/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer_2/w:0")
 
   def testVarOpScopeReuse(self):
     with self.test_session():
@@ -935,12 +970,12 @@ class VariableScopeTest(test.TestCase):
   def testGetCollection(self):
     with self.test_session():
       _ = variable_scope.get_variable("testGetCollection_a", [])
-      _ = variable_scope.get_variable("testGetCollection_b", [],
-                                      trainable=False)
+      _ = variable_scope.get_variable(
+          "testGetCollection_b", [], trainable=False)
       with variable_scope.variable_scope("testGetCollection_foo_") as scope1:
         _ = variable_scope.get_variable("testGetCollection_a", [])
-        _ = variable_scope.get_variable("testGetCollection_b", [],
-                                        trainable=False)
+        _ = variable_scope.get_variable(
+            "testGetCollection_b", [], trainable=False)
         self.assertEqual([
             v.name
             for v in scope1.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -954,8 +989,8 @@ class VariableScopeTest(test.TestCase):
         ])
       with variable_scope.variable_scope("testGetCollection_foo") as scope2:
         _ = variable_scope.get_variable("testGetCollection_a", [])
-        _ = variable_scope.get_variable("testGetCollection_b", [],
-                                        trainable=False)
+        _ = variable_scope.get_variable(
+            "testGetCollection_b", [], trainable=False)
         self.assertEqual([
             v.name
             for v in scope2.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -992,22 +1027,22 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope(
           "testGetTrainableVariables_foo") as scope:
         _ = variable_scope.get_variable("testGetTrainableVariables_b", [])
-        _ = variable_scope.get_variable("testGetTrainableVariables_c", [],
-                                        trainable=False)
-        self.assertEqual([v.name
-                          for v in scope.trainable_variables()],
-                         ["testGetTrainableVariables_foo/"
-                          "testGetTrainableVariables_b:0"])
+        _ = variable_scope.get_variable(
+            "testGetTrainableVariables_c", [], trainable=False)
+        self.assertEqual(
+            [v.name for v in scope.trainable_variables()],
+            ["testGetTrainableVariables_foo/"
+             "testGetTrainableVariables_b:0"])
 
   def testGetGlobalVariables(self):
     with self.test_session():
       _ = variable_scope.get_variable("testGetGlobalVariables_a", [])
       with variable_scope.variable_scope("testGetGlobalVariables_foo") as scope:
         _ = variable_scope.get_variable("testGetGlobalVariables_b", [])
-        self.assertEqual([v.name
-                          for v in scope.global_variables()],
-                         ["testGetGlobalVariables_foo/"
-                          "testGetGlobalVariables_b:0"])
+        self.assertEqual(
+            [v.name for v in scope.global_variables()],
+            ["testGetGlobalVariables_foo/"
+             "testGetGlobalVariables_b:0"])
 
   def testGetLocalVariables(self):
     with self.test_session():
@@ -1016,10 +1051,8 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("foo") as scope:
         _ = variable_scope.get_variable(
             "b", [], collections=[ops.GraphKeys.LOCAL_VARIABLES])
-        _ = variable_scope.get_variable(
-            "c", [])
-        self.assertEqual([v.name
-                          for v in scope.local_variables()], ["foo/b:0"])
+        _ = variable_scope.get_variable("c", [])
+        self.assertEqual([v.name for v in scope.local_variables()], ["foo/b:0"])
 
   def testGetVariableWithRefDtype(self):
     v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
@@ -1242,10 +1275,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       with ops.name_scope("prod_getter"):
         return g_0 * g_1
 
-    with variable_scope.variable_scope(
-        "prod_scope", custom_getter=prod_getter):
-      with variable_scope.variable_scope(
-          "sum_scope", custom_getter=sum_getter):
+    with variable_scope.variable_scope("prod_scope", custom_getter=prod_getter):
+      with variable_scope.variable_scope("sum_scope", custom_getter=sum_getter):
         with variable_scope.variable_scope(
             "inner_sum_scope", custom_getter=sum_getter):
           # take sums of sums of products
@@ -1270,9 +1301,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       np_vars, np_v = sess.run([true_vars, v])
       # take products of sums of products
       self.assertAllClose(
-          np_v,
-          (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3]))
-           + ((np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
+          np_v, (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3])) + (
+              (np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
 
   def testVariableCreator(self):
 
@@ -1368,7 +1398,11 @@ class VariableScopeMultithreadedTest(test.TestCase):
 
     graph = ops.get_default_graph()
     threads = [
-        threading.Thread(target=thread_fn, args=(i, graph,)) for i in range(2)]
+        threading.Thread(target=thread_fn, args=(
+            i,
+            graph,
+        )) for i in range(2)
+    ]
 
     threads[0].start()
     # Allow thread 0 to finish before starting thread 1.
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index ec741d3265b4216bd962280b0b927d6ad8a51fe4..64db49c900c21d60ba2337f920d6fa2cb9ab7b5f 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -12,148 +12,91 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-
-# pylint: disable=unused-import,g-bad-import-order
 """Contains the base Layer class, from which all layers inherit."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import copy
-import re
-import weakref
 
-import numpy as np
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import utils as layers_util
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('layers.Layer')
-class Layer(checkpointable.CheckpointableBase):
-  """Base layer class.
+InputSpec = base_layer.InputSpec  # pylint: disable=invalid-name
 
-  This is the class from which all layers inherit, implementing common
-  infrastructure functionality.
 
-  A layer is a class implementing common neural networks operations, such
-  as convolution, batch norm, etc. These operations require managing variables,
-  losses, and updates, as well as applying TensorFlow ops to input tensors.
+@tf_export('layers.Layer')
+class Layer(base_layer.Layer):
+  """Base layer class.
 
-  Users will just instantiate it and then treat it as a callable.
+  It is considered legacy, and we recommend the use of `tf.keras.layers.Layer`
+  instead.
 
-  We recommend that descendants of Layer implement the following methods:
-  * `__init__()`: Save configuration in member variables
-  * `build()`: Called once from `__call__`, when we know the shapes of inputs
-    and `dtype`. Should have the calls to `add_variable()`, and then
-    call the super's `build()` (which sets `self.built = True`, which is
-    nice in case the user wants to call `build()` manually before the
-    first `__call__`).
-  * `call()`: Called in `__call__` after making sure `build()` has been called
-    once. Should actually perform the logic of applying the layer to the
-    input tensors (which should be passed in as the first argument).
+  Arguments:
+    trainable: Boolean, whether the layer's variables should be trainable.
+    name: String name of the layer.
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
 
   Read-only properties:
-    `name`: The name of the layer (string).
-    `dtype`: Default dtype of the layer (default of `None` means use the
+    name: The name of the layer (string).
+    dtype: Default dtype of the layer's weights (default of `None` means use the
       type of the first input).
-    `trainable_variables`: List of trainable variables.
-    `non_trainable_variables`: List of non-trainable variables.
-    `variables`: List of all variables of this layer, trainable and
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and
       non-trainable.
-    `updates`: List of update ops of this layer.
-    `losses`: List of losses added by this layer.
+    updates: List of update ops of this layer.
+    losses: List of losses added by this layer.
+    trainable_weights: List of variables to be included in backprop.
+    non_trainable_weights: List of variables that should not be
+      included in backprop.
+    weights: The concatenation of the lists trainable_weights and
+      non_trainable_weights (in this order).
 
   Mutable properties:
-    `trainable`: Whether the layer should be trained (boolean).
-    `input_spec`: Optional (list of) `InputSpec` object(s) specifying the
+    trainable: Whether the layer should be trained (boolean).
+    input_spec: Optional (list of) `InputSpec` object(s) specifying the
       constraints on inputs that can be accepted by the layer.
   """
 
   def __init__(self, trainable=True, name=None, dtype=None,
-               activity_regularizer=None, **kwargs):
-    # We use a kwargs dict here because these kwargs only exist
-    # for compatibility reasons.
-    # The list of kwargs is subject to changes in the future.
-    # We do not want to commit to it or to expose the list to users at all.
-    # Note this is exactly as safe as defining kwargs in the function signature,
-    # the only difference being that the list of valid kwargs is defined
-    # below rather rather in the signature, and default values are defined
-    # in calls to kwargs.get().
-    allowed_kwargs = {
-        '_scope',
-        '_reuse',
-        'input_shape',  # For compatibility with Keras `Sequential` model.
-        'batch_size',  # For compatibility with Keras `Sequential` model.
-    }
-    for kwarg in kwargs:
-      if kwarg not in allowed_kwargs:
-        raise TypeError('Keyword argument not understood:', kwarg)
-
-    # Mutable properties
-    # Indicates whether the layer's weights are updated during training
-    # and whether the layer's updates are run during training
-    self.trainable = trainable
-    # A stateful layer is a layer whose updates are run during inference too,
-    # for instance stateful RNNs.
-    self.stateful = False
-    # Indicates whether `build` needs to be called upon layer call, to create
-    # the layer's weights.
-    self.built = False
-    # Provides information about which inputs are compatible with the layer.
-    self.input_spec = None
-
-    if activity_regularizer and context.executing_eagerly():
-      raise ValueError(
-          ('Activity regularization is not supported when executing eagerly. '
-           'Got activity_regularizer=%s') % (activity_regularizer,))
-    self._activity_regularizer = activity_regularizer
+               **kwargs):
+    # For backwards compatibility, legacy layers do not use `ResourceVariable`
+    # by default.
+    self._use_resource_variables = False
+    scope = kwargs.pop('_scope', None)
+    self._reuse = kwargs.pop('_reuse', None)
+
+    # Avoid an incorrect lint error
     self._trainable_weights = []
-    self._non_trainable_weights = []
-    self._updates = []
-    # When executing eagerly, _losses is a list of zero-argument lambdas which
-    # return tensors. When using graph execution, _losses is a list of ops.
-    self._losses = []
-    self._reuse = kwargs.get('_reuse')
-    self._graph = None  # Will be set at build time.
-    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
-    self._call_fn_args = estimator_util.fn_args(self.call)
-    self._compute_previous_mask = ('mask' in self._call_fn_args or
-                                   hasattr(self, 'compute_mask'))
-    self._call_has_scope_arg = 'scope' in self._call_fn_args
-
-    # These lists will be filled via successive calls
-    # to self._add_inbound_node().
-    self._inbound_nodes = []
-    self._outbound_nodes = []
+    self.built = False
 
-    self._init_set_name(name)
+    super(Layer, self).__init__(trainable=trainable, name=name, dtype=dtype,
+                                **kwargs)
 
-    # Determine variable scope.
-    scope = kwargs.get('_scope')
+    self._graph = None
+    self._call_has_scope_arg = 'scope' in self._call_fn_args
     if scope:
       with vs.variable_scope(scope) as captured_scope:
         self._scope = captured_scope
     else:
       self._scope = None
+    self._current_scope = None
 
-    # Set `_batch_input_shape` attribute
-    # for compatibility with Keras `Sequential` model.
-    if 'input_shape' in kwargs:
-      batch_size = kwargs.get('batch_size')
-      self._batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
+  @property
+  def graph(self):
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.graph not supported when executing eagerly.')
+    return self._graph
 
   def _init_set_name(self, name):
     # Determine layer name (non-unique).
@@ -166,18 +109,15 @@ class Layer(checkpointable.CheckpointableBase):
       self._name, base_name = self._make_unique_name()
     self._base_name = base_name
 
-  @property
-  def dtype(self):
-    return self._dtype
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def activity_regularizer(self):
-    """Optional regularizer function for the output of this layer."""
-    return self._activity_regularizer
+  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
+                        namespace='', zero_based=False):
+    base_name = base_layer.to_snake_case(self.__class__.__name__)
+    name = base_layer.unique_layer_name(base_name,
+                                        name_uid_map=name_uid_map,
+                                        avoid_names=avoid_names,
+                                        namespace=namespace,
+                                        zero_based=zero_based)
+    return (name, base_name)
 
   @property
   def scope_name(self):
@@ -189,271 +129,16 @@ class Layer(checkpointable.CheckpointableBase):
                        'querying `scope_name`.')
     return self._scope.name
 
-  @property
-  def trainable_weights(self):
-    return self._trainable_weights if self.trainable else []
-
-  @property
-  def non_trainable_weights(self):
-    if self.trainable:
-      return self._non_trainable_weights
-    else:
-      return self._trainable_weights + self._non_trainable_weights
-
-  @property
-  def trainable_variables(self):
-    return self.trainable_weights
-
-  @property
-  def non_trainable_variables(self):
-    return self.non_trainable_weights
-
-  @property
-  def weights(self):
-    """Returns the list of all layer variables/weights.
-
-    Returns:
-      A list of variables.
-    """
-    return self.trainable_weights + self.non_trainable_weights
-
-  @property
-  def variables(self):
-    """Returns the list of all layer variables/weights.
-
-    Returns:
-      A list of variables.
-    """
-    return self.weights
-
-  @property
-  def updates(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.updates not supported in Eager mode.')
-    if not self.trainable and not self.stateful:
-      return []
-    return self._updates
-
-  def add_update(self, updates, inputs=None):
-    """Add update op(s), potentially dependent on layer inputs.
-
-    Weight updates (for instance, the updates of the moving mean and variance
-    in a BatchNormalization layer) may be dependent on the inputs passed
-    when calling a layer. Hence, when reusing the same layer on
-    different inputs `a` and `b`, some entries in `layer.updates` may be
-    dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
-
-    The `get_updates_for` method allows to retrieve the updates relevant to a
-    specific set of inputs.
-
-    This call is ignored in Eager mode.
-
-    Arguments:
-      updates: Update op, or list/tuple of update ops.
-      inputs: If anything other than None is passed, it signals the updates
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for BatchNormalization updates, for instance.
-        If None, the updates will be taken into account unconditionally,
-        and you are responsible for making sure that any dependency they might
-        have is available at runtime.
-        A step counter might fall into this category.
-    """
-    if context.executing_eagerly():
-      return  # Updates already applied when in eager mode.
-
-    updates = _to_list(updates)
-    updates = [x if isinstance(x, ops.Operation)
-               else ops.convert_to_tensor(x) for x in updates]
-    self._updates += updates
-    if inputs is None:
-      for u in updates:
-        u._unconditional_update = True  # pylint: disable=protected-access
-    else:
-      for u in updates:
-        u._unconditional_update = False  # pylint: disable=protected-access
-
-  def get_updates_for(self, inputs):
-    """Retrieves updates relevant to a specific set of inputs.
-
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
-
-    Returns:
-      List of update ops of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
-
-    # Updates disabled if layer is not trainable and not explicitly stateful.
-    if not self.trainable and not self.stateful:
-      return []
-
-    if inputs is None:
-      # Requesting unconditional updates.
-      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
-
-    # Requesting input-conditional updates.
-    inputs = nest.flatten(inputs)
-    reachable = layers_util.get_reachable_from_inputs(inputs, self.updates)
-    updates = []
-    for update in self.updates:
-      if update in reachable:
-        updates.append(update)
-    return updates
-
-  @property
-  def losses(self):
-    """Losses which are associated with this `Layer`.
-
-    Note that when executing eagerly, getting this property evaluates
-    regularizers. When using graph execution, variable regularization ops have
-    already been created and are simply returned here.
-
-    Returns:
-      A list of tensors.
-    """
-    if context.executing_eagerly():
-      # _losses may only contain variable regularization losses when executing
-      # eagerly, and they have been saved as lambdas to be executed when
-      # requested.
-      return [regularizer() for regularizer in self._losses]
-    else:
-      return self._losses
-
   def add_loss(self, losses, inputs=None):
-    """Add loss tensor(s), potentially dependent on layer inputs.
-
-    Some losses (for instance, activity regularization losses) may be dependent
-    on the inputs passed when calling a layer. Hence, when reusing the same
-    layer on different inputs `a` and `b`, some entries in `layer.losses` may
-    be dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
-
-    The `get_losses_for` method allows to retrieve the losses relevant to a
-    specific set of inputs.
-
-    Note that `add_loss` is not supported when executing eagerly. Instead,
-    variable regularizers may be added through `add_variable`. Activity
-    regularization is not supported directly (but such losses may be returned
-    from `Layer.call()`).
-
-    Arguments:
-      losses: Loss tensor, or list/tuple of tensors.
-      inputs: If anything other than None is passed, it signals the losses
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for activity regularization losses, for instance.
-        If `None` is passed, the losses are assumed
-        to be unconditional, and will apply across all dataflows of the layer
-        (e.g. weight regularization losses).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      # TODO(fchollet): it should be possible (and highly desirable) to support
-      # `add_loss` in eager mode. This allows great convenience and flexibility
-      # in defining custom losses on the fly (e.g. in VAEs).
-      # Simply appending the loss value to `self._losses`
-      # is the correct behavior.
-      # The only caveat is that we need to force the user to only call
-      # `add_loss` from inside a model or Layer's `call` method
-      # (otherwise the loss computation cannot be backproped through).
-      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
-
-    losses = _to_list(losses)
-    self._losses += losses
-    if inputs is None:
-      for loss in losses:
-        loss._unconditional_loss = True  # pylint: disable=protected-access
-    else:
-      for loss in losses:
-        loss._unconditional_loss = False  # pylint: disable=protected-access
+    previous_losses_length = len(self._losses)
+    super(Layer, self).add_loss(losses, inputs=inputs)
     # TODO(fchollet): deprecate collection below.
-    _add_elements_to_collection(losses, ops.GraphKeys.REGULARIZATION_LOSSES)
-
-  def get_losses_for(self, inputs):
-    """Retrieves losses relevant to a specific set of inputs.
-
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
-
-    Returns:
-      List of loss tensors of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-
-    if inputs is None:
-      # Requesting unconditional losses.
-      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
-
-    # Requesting input-conditional losses.
-    inputs = nest.flatten(inputs)
-    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
-    # The losses we want to return will be part of this set.
-    # To avoid unnecessary work, we stop the search in case all of
-    # `self.losses` have been retrieved.
-    reachable = layers_util.get_reachable_from_inputs(inputs, self.losses)
-    losses = []
-    for loss in self.losses:
-      if loss in reachable:
-        losses.append(loss)
-    return losses
-
-  def build(self, _):
-    """Creates the variables of the layer."""
-    self.built = True
-
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """The logic of the layer lives here.
+    new_losses = self._losses[previous_losses_length:]
+    _add_elements_to_collection(new_losses, ops.GraphKeys.REGULARIZATION_LOSSES)
 
-    Arguments:
-      inputs: input tensor(s).
-      **kwargs: additional keyword arguments.
-
-    Returns:
-      Output tensor(s).
-    """
-    return inputs
-
-  def _name_scope_name(self, current_variable_scope):
+  def _name_scope(self):
     """Determines op naming for the Layer."""
-    return current_variable_scope.original_name_scope
-
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
-
-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`.  It need not
-        be fully defined (e.g. the batch size may be unknown).
-
-    Returns:
-      A (possibly nested tuple of) `TensorShape`.
-
-    Raises:
-      TypeError: if `input_shape` is not a (possibly nested tuple of)
-        `TensorShape`.
-      ValueError: if `input_shape` is incomplete or is incompatible with the
-        the layer.
-    """
-    raise NotImplementedError
-
-  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
-                        namespace='', zero_based=False):
-    base_name = _to_snake_case(self.__class__.__name__)
-    name = _unique_layer_name(base_name, name_uid_map=name_uid_map,
-                              avoid_names=avoid_names, namespace=namespace,
-                              zero_based=zero_based)
-    return (name, base_name)
+    return self._current_scope.original_name_scope
 
   def _set_scope(self, scope=None):
     if self._scope is None:
@@ -467,10 +152,11 @@ class Layer(checkpointable.CheckpointableBase):
             scope, default_name=self._base_name) as captured_scope:
           self._scope = captured_scope
 
-  def add_variable(self, name, shape, dtype=None,
-                   initializer=None, regularizer=None,
-                   trainable=True, constraint=None,
-                   partitioner=None):
+  def add_weight(self, name, shape, dtype=None,
+                 initializer=None, regularizer=None,
+                 trainable=True, constraint=None,
+                 use_resource=None,
+                 partitioner=None):
     """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
@@ -486,6 +172,7 @@ class Layer(checkpointable.CheckpointableBase):
         then this parameter is ignored and any added variables are also
         marked as non-trainable.
       constraint: constraint instance (callable).
+      use_resource: Whether to use `ResourceVariable`.
       partitioner: (optional) partitioner instance (callable).  If
         provided, when the requested variable is created it will be split
         into multiple partitions according to `partitioner`.  In this case,
@@ -504,10 +191,6 @@ class Layer(checkpointable.CheckpointableBase):
       RuntimeError: If called with partioned variable regularization and
         eager execution is enabled.
     """
-
-    # `init_graph` should point to the graph in which variable initialization
-    # will occur; it should be None if and only if initialization will take
-    # place in the eager context.
     init_graph = None
     if not context.executing_eagerly():
       default_graph = ops.get_default_graph()
@@ -530,71 +213,43 @@ class Layer(checkpointable.CheckpointableBase):
 
     self._set_scope(None)
     reuse = self.built or self._reuse
+    prev_len_trainable = len(self._trainable_weights)
     with vs.variable_scope(
         self._scope, reuse=reuse, auxiliary_name_scope=False) as scope:
-      with ops.name_scope(self._name_scope_name(scope)):
-        variable = self._add_variable_with_custom_getter(
-            name=name,
-            shape=shape,
-            getter=vs.get_variable,
-            # Manage errors in Layer rather than Checkpointable.
-            overwrite=True,
-            initializer=initializer,
+      self._current_scope = scope
+      with ops.name_scope(self._name_scope()):
+        use_resource = (use_resource or
+                        self._use_resource_variables or
+                        scope.use_resource)
+        variable = super(Layer, self).add_weight(
+            name,
+            shape,
             dtype=dtypes.as_dtype(dtype),
+            initializer=initializer or scope.initializer,
+            trainable=trainable,
             constraint=constraint,
-            trainable=trainable and self.trainable,
-            partitioner=partitioner)
-
-        if init_graph is not None:  # pylint: disable=protected-access
-          # The variable was created and initialized in a graph.
-
-          if variable in existing_variables:
-            # To match the behavior of tf.get_variable(), we only apply
-            # regularization if the variable is newly created.
-            return variable
-
+            partitioner=partitioner,
+            use_resource=use_resource,
+            getter=vs.get_variable)
+
+        if regularizer:
+          if context.executing_eagerly() or variable not in existing_variables:
+            self._handle_weight_regularization(name, variable, regularizer)
+
+        if init_graph is not None:
+          # Handle edge case where a custom getter has overridden `trainable`.
+          # There is one known occurrence of this, in unit test
+          # testBasicRNNCellNotTrainable in
+          # contrib.rnn.python.kernel_tests.core_rnn_cell_test
           with init_graph.as_default():
             trainable_variables = tf_variables.trainable_variables()
           if (trainable and self.trainable and
               variable not in trainable_variables):
             # A custom getter / variable scope overrode the trainable flag.
-            trainable = False
-
-          if regularizer:
-            if isinstance(variable, tf_variables.PartitionedVariable):
-              for v in variable:
-                with ops.colocate_with(v.op):
-                  with ops.name_scope(name + '/Regularizer'):
-                    regularization = regularizer(v)
-                if regularization is not None:
-                  self.add_loss(regularization)
-            else:
-              with ops.colocate_with(variable.op):
-                with ops.name_scope(name + '/Regularizer'):
-                  regularization = regularizer(variable)
-              if regularization is not None:
-                self.add_loss(regularization)
-        elif regularizer:  # and initialization took place in an eager context
-          if isinstance(variable, tf_variables.PartitionedVariable):
-            raise RuntimeError(
-                'Partitioned variable regularization is not yet '
-                'supported when executing eagerly. File a feature request '
-                'if this is important to you.')
-          # Save a zero-argument lambda which runs the regularizer on the
-          # variable, to be executed when `Layer.losses` is requested.
-          # This makes losses responsive to variable updates when executing
-          # eagerly.
-          #
-          # TODO(akshayka): Do the same for graphs as well, so that losses
-          # collected in a while_loop can be run outside its control flow
-          # context and so that losses won't be swallowed up by graph functions
-          # (i.e., `.losses()` should always create regularizers).
-          self._losses.append(lambda: regularizer(variable))
-
-    if trainable:
-      self._trainable_weights.append(variable)
-    else:
-      self._non_trainable_weights.append(variable)
+            extra_trainable_vars = self._trainable_weights[prev_len_trainable:]
+            self._trainable_weights = self._trainable_weights[
+                :prev_len_trainable]
+            self._non_trainable_weights += extra_trainable_vars
     return variable
 
   def __call__(self, inputs, *args, **kwargs):
@@ -622,35 +277,14 @@ class Layer(checkpointable.CheckpointableBase):
       ValueError: if the layer's `call` method returns None (an invalid value).
     """
     self._set_scope(kwargs.pop('scope', None))
-    input_list = nest.flatten(inputs)
 
-    build_graph = not context.executing_eagerly()
-    # TODO(fchollet, allenl): Make deferred mode work with subclassed Models
-    # which don't use an "inputs" argument.
-    in_deferred_mode = isinstance(input_list[0], _DeferredTensor)
-    # Ensure the Layer, if being reused, is working with inputs from
-    # the same graph as where it was created.
-    if build_graph:
+    if not context.executing_eagerly():
       try:
         # Set layer's "graph" at build time
-        self._graph = ops._get_graph_from_inputs(input_list, graph=self._graph)  # pylint: disable=protected-access
+        self._graph = ops._get_graph_from_inputs(nest.flatten(inputs),  # pylint: disable=protected-access
+                                                 graph=self._graph)
       except ValueError as e:
         raise ValueError('Input graph and Layer graph are not the same: %s' % e)
-    if build_graph or in_deferred_mode:
-      user_kwargs = copy.copy(kwargs)
-
-    # Handle Keras mask propagation from previous layer to current layer.
-    previous_mask = None
-    if (not hasattr(self, '_compute_previous_mask') or
-        self._compute_previous_mask):
-      previous_mask = _collect_previous_mask(inputs)
-      if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = estimator_util.fn_args(self.call)
-      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
-          not _is_all_none(previous_mask)):
-        # The previous layer generated a mask, and mask was not explicitly pass
-        # to __call__, hence we set previous_mask as the default value.
-        kwargs['mask'] = previous_mask
 
     if self.built:
       try:
@@ -667,134 +301,27 @@ class Layer(checkpointable.CheckpointableBase):
     else:
       scope_context_manager = vs.variable_scope(
           self._scope, reuse=self._reuse, auxiliary_name_scope=False)
-    input_shapes = None
-    with scope_context_manager as scope:
-      with ops.name_scope(self._name_scope_name(scope)):
-        if not self.built:
-          if not build_graph:
-            # Activity regularization is currently unsupported in Eager mode.
-            if self._activity_regularizer:
-              raise ValueError(
-                  'activity_regularizer currently unsupported with '
-                  'eager execution enabled. Found an activity_regularizer in '
-                  '%s(%s).' % (self.__class__.__name__, self))
-          if not build_graph and not in_deferred_mode:
-            # TODO(agarwal): support _keras_history in Eager mode.
-            for x in input_list:
-              if hasattr(x, '_keras_history'):
-                raise ValueError('_keras_history currently unsupported in '
-                                 'Eager mode. Found _keras_history in %s while '
-                                 'executing __call__ for %s(%s)' %
-                                 (x, self.__class_.__name__, self))
-
-          # Check input assumptions set before layer building, e.g. input rank.
-          self._assert_input_compatibility(inputs)
-          if input_list and self._dtype is None:
-            try:
-              self._dtype = input_list[0].dtype.base_dtype.name
-            except AttributeError:
-              pass
-          if all(hasattr(x, 'get_shape') for x in input_list):
-            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-          self.build(input_shapes)
-        try:
-          # Note: not all sub-classes of Layer call Layer.__init__ (especially
-          # the ones under tensorflow/python/keras). Hence we recompute this
-          # attribute here if it is not set.
-          # TODO(agarwal): Fix the sub-classes and avoid this complexity.
-          call_has_scope_arg = self._call_has_scope_arg
-        except AttributeError:
-          self._call_fn_args = estimator_util.fn_args(self.call)
-          self._call_has_scope_arg = 'scope' in self._call_fn_args
-          call_has_scope_arg = self._call_has_scope_arg
-        if call_has_scope_arg:
-          kwargs['scope'] = scope
-        # Check input assumptions set after layer building, e.g. input shape.
-        if build_graph or in_deferred_mode:
-          self._assert_input_compatibility(inputs)
-
-        if not in_deferred_mode:
-          outputs = self.call(inputs, *args, **kwargs)
-          if outputs is None:
-            raise ValueError('A layer\'s `call` method should return a Tensor '
-                             'or a list of Tensors, not None.')
-        else:
-          # Deferred mode behavior: use `compute_output_shape` to
-          # infer the number of outputs of the layer and their shapes.
-          if input_shapes is None:
-            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-
-          output_shapes = self.compute_output_shape(input_shapes)
-          output_shapes = nest.flatten(output_shapes)
-          outputs = [
-              # TODO(fchollet): name the deferred tensors?
-              _DeferredTensor(shape=shape, dtype=self._dtype)
-              for shape in output_shapes
-          ]
-          if len(outputs) == 1:
-            outputs = outputs[0]
 
-        if build_graph:
-          # Apply activity regularization.
-          # Note that it should be applied every time the layer creates a new
-          # output, since it is output-specific.
-          if self._activity_regularizer:
-            output_list = nest.flatten(outputs)
-            for output in output_list:
-              with ops.name_scope('ActivityRegularizer'):
-                activity_regularization = self._activity_regularizer(output)
-              self.add_loss(activity_regularization, inputs=inputs)
+    with scope_context_manager as scope:
+      self._current_scope = scope
 
-          # TODO(fchollet): consider enabling masking for Eager mode.
-          if hasattr(self, 'compute_mask'):
-            output_mask = self.compute_mask(inputs, previous_mask)
-            if isinstance(outputs, (list, tuple)):
-              if output_mask is None:
-                output_mask = [None for _ in range(len(outputs))]
-              for x, m in zip(outputs, output_mask):
-                x._keras_mask = m  # pylint: disable=protected-access
-            else:
-              outputs._keras_mask = output_mask  # pylint: disable=protected-access
+      try:
+        call_has_scope_arg = self._call_has_scope_arg
+      except AttributeError:
+        self._call_fn_args = estimator_util.fn_args(self.call)
+        self._call_has_scope_arg = 'scope' in self._call_fn_args
+        call_has_scope_arg = self._call_has_scope_arg
+      if call_has_scope_arg:
+        kwargs['scope'] = scope
 
-    if build_graph:
-      # If all input tensors have history metadata,
-      # we update the output tensors
-      # with corresponding history metadata, thus eventually allowing to use
-      # these tensors to instantiate a Network.
-      if _have_all_keras_metadata(inputs):
-        # If the layer returns tensors from its inputs, unmodified,
-        # we copy them to avoid loss of tensor metadata.
-        output_ls = nest.flatten(outputs)
-        output_ls_copy = []
-        for x in output_ls:
-          if x in input_list:
-            with ops.name_scope(scope.original_name_scope):
-              x = array_ops.identity(x)
-          output_ls_copy.append(x)
-        if len(output_ls_copy) == 1:
-          outputs = output_ls_copy[0]
-        else:
-          outputs = output_ls_copy
+      # Actually call layer
+      outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
 
+    if not context.executing_eagerly():
       # Update global default collections.
       _add_elements_to_collection(self.updates, ops.GraphKeys.UPDATE_OPS)
-
-    if in_deferred_mode or build_graph:
-      if _have_all_keras_metadata(inputs):
-        # Add an inbound node to the layer, so it can keep track of this call.
-        # This updates the layer history of the output tensor(s).
-        self._add_inbound_node(
-            input_tensors=inputs, output_tensors=outputs, arguments=user_kwargs)
-
-    self.built = True
     return outputs
 
-  @property
-  def graph(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.graph not supported in Eager mode.')
-    return self._graph
-
   def __deepcopy__(self, memo):
     no_copy = set(['_graph'])
     shallow_copy = set(['_scope', '_always_reuse_variable_scope'])
@@ -806,658 +333,12 @@ class Layer(checkpointable.CheckpointableBase):
         setattr(result, k, v)
       elif k in shallow_copy:
         setattr(result, k, copy.copy(v))
-      elif _is_tensor_or_tensor_list(v):
+      elif base_layer.is_tensor_or_tensor_list(v):
         setattr(result, k, v)
       else:
         setattr(result, k, copy.deepcopy(v, memo))
     return result
 
-  def apply(self, inputs, *args, **kwargs):
-    """Apply the layer on a input.
-
-    This simply wraps `self.__call__`.
-
-    Arguments:
-      inputs: Input tensor(s).
-      *args: additional positional arguments to be passed to `self.call`.
-      **kwargs: additional keyword arguments to be passed to `self.call`.
-
-    Returns:
-      Output tensor(s).
-    """
-    return self.__call__(inputs, *args, **kwargs)
-
-  def _add_inbound_node(self,
-                        input_tensors,
-                        output_tensors,
-                        arguments=None):
-    """Internal method to create an inbound node for the layer.
-
-    Arguments:
-        input_tensors: list of input tensors.
-        output_tensors: list of output tensors.
-        arguments: dictionary of keyword arguments that were passed to the
-            `call` method of the layer at the call that created the node.
-    """
-    input_tensors = nest.flatten(input_tensors)
-    output_tensors = nest.flatten(output_tensors)
-
-    # Collect input tensor(s) coordinates.
-    inbound_layers = []
-    node_indices = []
-    tensor_indices = []
-    for x in input_tensors:
-      assert hasattr(x, '_keras_history')
-      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      inbound_layers.append(inbound_layer)
-      node_indices.append(node_index)
-      tensor_indices.append(tensor_index)
-
-    # Create node, add it to inbound nodes.
-    Node(
-        self,
-        inbound_layers=inbound_layers,
-        node_indices=node_indices,
-        tensor_indices=tensor_indices,
-        input_tensors=input_tensors,
-        output_tensors=output_tensors,
-        arguments=arguments)
-
-    # Update tensor history metadata.
-    for i in range(len(output_tensors)):
-      # The metadata attribute consists of 1) a layer instance
-      # 2) a node index for the layer, 3) a tensor index for the node.
-      # The allows layer reuse (multiple nodes per layer) and multi-output
-      # or multi-input layers (e.g. a layer can return multiple tensors,
-      # and each can be sent to a different layer).
-      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
-
-  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
-    """Private utility to retrieves an attribute (e.g. inputs) from a node.
-
-    This is used to implement the methods:
-        - get_input_shape_at
-        - get_output_shape_at
-        - get_input_at
-        etc...
-
-    Arguments:
-        node_index: Integer index of the node from which
-            to retrieve the attribute.
-        attr: Exact node attribute name.
-        attr_name: Human-readable attribute name, for error messages.
-
-    Returns:
-        The layer's attribute `attr` at the node of index `node_index`.
-
-    Raises:
-        RuntimeError: If the layer has no inbound nodes, or if called in Eager
-        mode.
-        ValueError: If the index provided does not match any node.
-    """
-    if not self._inbound_nodes:
-      raise RuntimeError('The layer has never been called '
-                         'and thus has no defined ' + attr_name + '.')
-    if not len(self._inbound_nodes) > node_index:
-      raise ValueError('Asked to get ' + attr_name + ' at node ' +
-                       str(node_index) + ', but the layer has only ' +
-                       str(len(self._inbound_nodes)) + ' inbound nodes.')
-    values = getattr(self._inbound_nodes[node_index], attr)
-    if len(values) == 1:
-      return values[0]
-    else:
-      return values
-
-  def get_input_shape_at(self, node_index):
-    """Retrieves the input shape(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A shape tuple
-        (or list of shape tuples if the layer has multiple inputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    return self._get_node_attribute_at_index(node_index, 'input_shapes',
-                                             'input shape')
-
-  def get_output_shape_at(self, node_index):
-    """Retrieves the output shape(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A shape tuple
-        (or list of shape tuples if the layer has multiple outputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          'Layer.get_output_shape_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'output_shapes',
-                                             'output shape')
-
-  def get_input_at(self, node_index):
-    """Retrieves the input tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A tensor (or list of tensors if the layer has multiple inputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_input_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'input_tensors',
-                                             'input')
-
-  def get_output_at(self, node_index):
-    """Retrieves the output tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A tensor (or list of tensors if the layer has multiple outputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    return self._get_node_attribute_at_index(node_index, 'output_tensors',
-                                             'output')
-
-  @property
-  def input(self):
-    """Retrieves the input tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one input,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-        Input tensor or list of input tensors.
-
-    Raises:
-        AttributeError: if the layer is connected to
-        more than one incoming layers.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-      AttributeError: If no inbound nodes are found.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('Layer ' + self.name +
-                           ' is not connected, no input to return.')
-    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
-
-  @property
-  def output(self):
-    """Retrieves the output tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one output,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-      Output tensor or list of output tensors.
-
-    Raises:
-      AttributeError: if the layer is connected to more than one incoming
-        layers.
-      RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
-    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
-
-  @property
-  def input_shape(self):
-    """Retrieves the input shape(s) of a layer.
-
-    Only applicable if the layer has exactly one input,
-    i.e. if it is connected to one incoming layer, or if all inputs
-    have the same shape.
-
-    Returns:
-        Input shape, as an integer shape tuple
-        (or list of shape tuples, one tuple per input tensor).
-
-    Raises:
-        AttributeError: if the layer has no defined input_shape.
-        RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('The layer has never been called '
-                           'and thus has no defined input shape.')
-    all_input_shapes = set(
-        [str(node.input_shapes) for node in self._inbound_nodes])
-    if len(all_input_shapes) == 1:
-      input_shapes = self._inbound_nodes[0].input_shapes
-      if len(input_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in input_shapes
-        ]
-    else:
-      raise AttributeError('The layer "' + str(self.name) +
-                           ' has multiple inbound nodes, '
-                           'with different input shapes. Hence '
-                           'the notion of "input shape" is '
-                           'ill-defined for the layer. '
-                           'Use `get_input_shape_at(node_index)` '
-                           'instead.')
-
-  def count_params(self):
-    """Count the total number of scalars composing the weights.
-
-    Returns:
-        An integer count.
-
-    Raises:
-        ValueError: if the layer isn't yet built
-          (in which case its weights aren't yet defined).
-    """
-    if not self.built:
-      if self.__class__.__name__ == 'Sequential':
-        self.build()  # pylint: disable=no-value-for-parameter
-      else:
-        raise ValueError('You tried to call `count_params` on ' + self.name +
-                         ', but the layer isn\'t built. '
-                         'You can build it manually via: `' + self.name +
-                         '.build(batch_input_shape)`.')
-    weight_shapes = [w.get_shape().as_list() for w in self.weights]
-    return int(sum([np.prod(w) for w in weight_shapes]))
-
-  @property
-  def output_shape(self):
-    """Retrieves the output shape(s) of a layer.
-
-    Only applicable if the layer has one output,
-    or if all outputs have the same shape.
-
-    Returns:
-        Output shape, as an integer shape tuple
-        (or list of shape tuples, one tuple per output tensor).
-
-    Raises:
-        AttributeError: if the layer has no defined output shape.
-        RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('The layer has never been called '
-                           'and thus has no defined output shape.')
-    all_output_shapes = set(
-        [str(node.output_shapes) for node in self._inbound_nodes])
-    if len(all_output_shapes) == 1:
-      output_shapes = self._inbound_nodes[0].output_shapes
-      if len(output_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in output_shapes
-        ]
-    else:
-      raise AttributeError('The layer "%s"'
-                           ' has multiple inbound nodes, '
-                           'with different output shapes. Hence '
-                           'the notion of "output shape" is '
-                           'ill-defined for the layer. '
-                           'Use `get_output_shape_at(node_index)` '
-                           'instead.' % self.name)
-
-  @property
-  def inbound_nodes(self):
-    """Deprecated, do NOT use! Only for compatibility with external Keras."""
-    return self._inbound_nodes
-
-  @property
-  def outbound_nodes(self):
-    """Deprecated, do NOT use! Only for compatibility with external Keras."""
-    return self._outbound_nodes
-
-  def _assert_input_compatibility(self, inputs):
-    """Checks compatibility between the layer and provided inputs.
-
-    This checks that the tensor(s) `inputs` verify the input assumptions
-    of the layer (if any). If not, a clear and actional exception gets raised.
-
-    Arguments:
-        inputs: input tensor or list of input tensors.
-
-    Raises:
-        ValueError: in case of mismatch between
-            the provided inputs and the expectations of the layer.
-    """
-    if not self.input_spec:
-      return
-    if not isinstance(self.input_spec, (list, tuple)):
-      input_spec = nest.flatten(self.input_spec)
-    else:
-      input_spec = self.input_spec
-    inputs = nest.flatten(inputs)
-    if len(inputs) != len(input_spec):
-      raise ValueError('Layer ' + self.name + ' expects ' +
-                       str(len(input_spec)) + ' inputs, '
-                       'but it received ' + str(len(inputs)) +
-                       ' input tensors. Inputs received: ' + str(inputs))
-    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
-      if spec is None:
-        continue
-
-      if (spec.ndim is not None or
-          spec.min_ndim is not None or
-          spec.max_ndim is not None):
-        if x.get_shape().ndims is None:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'its rank is undefined, but the layer requires a '
-                           'defined rank.')
-
-      # Check ndim.
-      if spec.ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim != spec.ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
-                           str(ndim) + '. Full shape received: ' +
-                           str(x.get_shape().as_list()))
-      if spec.max_ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim is not None and ndim > spec.max_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected max_ndim=' + str(spec.max_ndim) +
-                           ', found ndim=' + str(ndim))
-      if spec.min_ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim is not None and ndim < spec.min_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           ': expected min_ndim=' + str(spec.min_ndim) +
-                           ', found ndim=' + str(ndim) +
-                           '. Full shape received: ' +
-                           str(x.get_shape().as_list()))
-      # Check dtype.
-      if spec.dtype is not None:
-        if x.dtype != spec.dtype:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected dtype=' + str(spec.dtype) +
-                           ', found dtype=' + str(x.dtype))
-      # Check specific shape axes.
-      if spec.axes:
-        shape = x.get_shape().as_list()
-        if shape is not None:
-          for axis, value in spec.axes.items():
-            if hasattr(value, 'value'):
-              value = value.value
-            if value is not None and shape[int(axis)] not in {value, None}:
-              raise ValueError(
-                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
-                  ' incompatible with the layer: expected axis ' + str(axis) +
-                  ' of input shape to have value ' + str(value) +
-                  ' but received input with shape ' + str(shape))
-      # Check shape.
-      if spec.shape is not None:
-        shape = x.get_shape().as_list()
-        if shape is not None:
-          for spec_dim, dim in zip(spec.shape, shape):
-            if spec_dim is not None and dim is not None:
-              if spec_dim != dim:
-                raise ValueError('Input ' + str(input_index) +
-                                 ' is incompatible with layer ' + self.name +
-                                 ': expected shape=' + str(spec.shape) +
-                                 ', found shape=' + str(shape))
-
-
-@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
-class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
-
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
-
-  A None entry in a shape is compatible with any dimension,
-  a None shape is compatible with any shape.
-
-  Arguments:
-      dtype: Expected DataType of the input.
-      shape: Shape tuple, expected shape of the input
-          (may include None for unchecked axes).
-      ndim: Integer, expected rank of the input.
-      max_ndim: Integer, maximum rank of the input.
-      min_ndim: Integer, minimum rank of the input.
-      axes: Dictionary mapping integer axes to
-          a specific dimension value.
-  """
-
-  def __init__(self,
-               dtype=None,
-               shape=None,
-               ndim=None,
-               max_ndim=None,
-               min_ndim=None,
-               axes=None):
-    self.dtype = dtype
-    self.shape = shape
-    if shape is not None:
-      self.ndim = len(shape)
-    else:
-      self.ndim = ndim
-    self.max_ndim = max_ndim
-    self.min_ndim = min_ndim
-    self.axes = axes or {}
-
-  def __repr__(self):
-    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
-            ('shape=' + str(self.shape)) if self.shape else '',
-            ('ndim=' + str(self.ndim)) if self.ndim else '',
-            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
-            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
-            ('axes=' + str(self.axes)) if self.axes else '']
-    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
-
-
-class Node(object):
-  """A `Node` describes the connectivity between two layers.
-
-  Each time a layer is connected to some new input,
-  a node is added to `layer._inbound_nodes`.
-  Each time the output of a layer is used by another layer,
-  a node is added to `layer._outbound_nodes`.
-
-  Arguments:
-      outbound_layer: the layer that takes
-          `input_tensors` and turns them into `output_tensors`
-          (the node gets created when the `call`
-          method of the layer was called).
-      inbound_layers: a list of layers, the same length as `input_tensors`,
-          the layers from where `input_tensors` originate.
-      node_indices: a list of integers, the same length as `inbound_layers`.
-          `node_indices[i]` is the origin node of `input_tensors[i]`
-          (necessary since each inbound layer might have several nodes,
-          e.g. if the layer is being shared with a different data stream).
-      tensor_indices: a list of integers,
-          the same length as `inbound_layers`.
-          `tensor_indices[i]` is the index of `input_tensors[i]` within the
-          output of the inbound layer
-          (necessary since each inbound layer might
-          have multiple tensor outputs, with each one being
-          independently manipulable).
-      input_tensors: list of input tensors.
-      output_tensors: list of output tensors.
-      arguments: dictionary of keyword arguments that were passed to the
-          `call` method of the layer at the call that created the node.
-
-  `node_indices` and `tensor_indices` are basically fine-grained coordinates
-  describing the origin of the `input_tensors`.
-
-  A node from layer A to layer B is added to:
-    - A._outbound_nodes
-    - B._inbound_nodes
-  """
-
-  def __init__(self,
-               outbound_layer,
-               inbound_layers,
-               node_indices,
-               tensor_indices,
-               input_tensors,
-               output_tensors,
-               arguments=None):
-    # Layer instance (NOT a list).
-    if isinstance(outbound_layer, list):
-      raise ValueError(
-          '`outbound_layer` should be a layer instance, not a list.')
-    # this is the layer that takes a list of input tensors
-    # and turns them into a list of output tensors.
-    # the current node will be added to
-    # the inbound_nodes of outbound_layer.
-    self.outbound_layer = outbound_layer
-
-    # The following 3 properties describe where
-    # the input tensors come from: which layers,
-    # and for each layer, which node and which
-    # tensor output of each node.
-
-    # List of layer instances.
-    self.inbound_layers = inbound_layers
-    # List of integers, 1:1 mapping with inbound_layers.
-    self.node_indices = node_indices
-    # List of integers, 1:1 mapping with inbound_layers.
-    self.tensor_indices = tensor_indices
-
-    # Following 2 properties:
-    # tensor inputs and outputs of outbound_layer.
-
-    # List of tensors. 1:1 mapping with inbound_layers.
-    self.input_tensors = input_tensors
-    # List of tensors, created by outbound_layer.call().
-    self.output_tensors = output_tensors
-
-    # Following 2 properties: input and output shapes.
-
-    # List of shape tuples, shapes of input_tensors.
-    self.input_shapes = [layers_util.static_shape(x) for x in input_tensors]
-    # List of shape tuples, shapes of output_tensors.
-    self.output_shapes = [layers_util.static_shape(x) for x in output_tensors]
-
-    # Optional keyword arguments to layer's `call`.
-    self.arguments = arguments
-
-    # Add nodes to all layers involved.
-    for layer in inbound_layers:
-      if layer is not None:
-        # For compatibility with external Keras, we use the deprecated
-        # accessor here.
-        layer.outbound_nodes.append(self)
-    # For compatibility with external Keras, we use the deprecated
-    # accessor here.
-    outbound_layer.inbound_nodes.append(self)
-
-  def get_config(self):
-    inbound_names = []
-    for layer in self.inbound_layers:
-      if layer:
-        inbound_names.append(layer.name)
-      else:
-        inbound_names.append(None)
-    return {
-        'outbound_layer': self.outbound_layer.name,
-        'inbound_layers': inbound_names,
-        'node_indices': self.node_indices,
-        'tensor_indices': self.tensor_indices
-    }
-
-
-class _DeferredTensor(object):
-  """Tensor-like object used to build graphs of layers in Eager mode.
-
-  When calling a layer on a DeferredTensor, the layer will not perform any
-  computation and will simply perfom shape inference to return new
-  DeferredTensors with appropriate shape information. Thus DeferredTensor
-  behaves like a graph-mode Tensor when manipulated by layers.
-  """
-
-  def __init__(self, shape, dtype, name=None):
-    self.shape = tensor_shape.TensorShape(shape)
-    if dtype is None:
-      self.dtype = dtypes.as_dtype(np.float32)
-    else:
-      self.dtype = dtypes.as_dtype(dtype)
-    self.name = name
-
-  def get_shape(self):
-    return self.shape
-
-  def __str__(self):
-    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
-                                                         self.get_shape(),
-                                                         self.dtype.name)
-
-  def __repr__(self):
-    return "<_DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
-                                                         self.get_shape(),
-                                                         self.dtype.name)
-
-
-def _is_tensor_or_tensor_list(v):
-  v = nest.flatten(v)
-  if v and isinstance(v[0], ops.Tensor):
-    return True
-  else:
-    return False
-
-
-def _to_snake_case(name):
-  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
-  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
-  # If the class is private the name starts with "_" which is not secure
-  # for creating scopes. We prefix the name with "private" in this case.
-  if insecure[0] != '_':
-    return insecure
-  return 'private' + insecure
-
-
-def _to_list(x):
-  """This normalizes a list/tuple or single element into a list.
-
-  If a single element is passed, we return
-  a list of size 1 containing the element.
-
-  Arguments:
-    x: list or tuple or single element.
-
-  Returns:
-    A list.
-  """
-  if isinstance(x, (list, tuple)):
-    return list(x)
-  return [x]
-
 
 def _add_elements_to_collection(elements, collection_list):
   if context.executing_eagerly():
@@ -1473,105 +354,3 @@ def _add_elements_to_collection(elements, collection_list):
       if element not in collection_set:
         collection.append(element)
 
-
-def _is_all_none(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  # We cannot use Python's `any` because the iterable may return Tensors.
-  for element in iterable:
-    if element is not None:
-      return False
-  return True
-
-
-def _have_all_keras_metadata(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  return all([hasattr(x, '_keras_history') for x in iterable])
-
-
-def _collect_previous_mask(input_tensors):
-  """Retrieves the output mask(s) of the previous node.
-
-  Arguments:
-      input_tensors: A tensor or list of tensors.
-
-  Returns:
-      A mask tensor or list of mask tensors.
-  """
-  input_tensors = nest.flatten(input_tensors)
-  masks = []
-  for x in input_tensors:
-    if hasattr(x, '_keras_mask'):
-      mask = x._keras_mask  # pylint: disable=protected-access
-      masks.append(mask)
-    else:
-      masks.append(None)
-  if len(masks) == 1:
-    return masks[0]
-  return masks
-
-
-# A global dictionary mapping graph objects to an index of counters used
-# for various layer names in each graph.
-# Allows to give unique autogenerated names to layers, in a graph-specific way.
-PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
-
-
-def _get_default_graph_uid_map():
-  graph = ops.get_default_graph()
-  name_uid_map = PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
-  if name_uid_map is None:
-    name_uid_map = collections.defaultdict(int)
-    PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
-  return name_uid_map
-
-
-def _unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
-                       zero_based=False):
-  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
-
-  Arguments:
-    name: String name to make unique.
-    name_uid_map: An optional defaultdict(int) to use when creating unique
-      names. If None (default), uses a per-Graph dictionary.
-    avoid_names: An optional set or dict with names which should not be used. If
-      None (default) does not avoid any names.
-    namespace: Gets a name which is unique within the (graph, namespace). Layers
-      which are not Networks use a blank namespace and so get graph-global
-      names.
-    zero_based: If True, name sequences start with no suffix (e.g. "dense",
-      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
-
-  Returns:
-    Unique string name.
-
-  Example:
-
-  ```python
-  _unique_layer_name('dense')  # dense_1
-  _unique_layer_name('dense')  # dense_2
-  ```
-  """
-  if name_uid_map is None:
-    name_uid_map = _get_default_graph_uid_map()
-  if avoid_names is None:
-    avoid_names = set()
-  proposed_name = None
-  while proposed_name is None or proposed_name in avoid_names:
-    name_key = (namespace, name)
-    if zero_based:
-      number = name_uid_map[name_key]
-      if number:
-        proposed_name = name + '_' + str(number)
-      else:
-        proposed_name = name
-      name_uid_map[name_key] += 1
-    else:
-      name_uid_map[name_key] += 1
-      proposed_name = name + '_' + str(name_uid_map[name_key])
-  return proposed_name
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 9ed4afeaba931c47d2a1e65f08489773f0b9eb1b..f08b552840f5ff9144edae1cb0f90a1bc3db0f8c 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -52,6 +52,12 @@ class BaseLayerTest(test.TestCase):
     layer = base_layers.Layer(name='my_layer', trainable=False)
     self.assertEqual(layer.trainable, False)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testInt64Layer(self):
+    layer = base_layers.Layer(name='my_layer', dtype='int64')
+    layer.add_variable('my_var', [2, 2])
+    self.assertEqual(layer.name, 'my_layer')
+
   @test_util.run_in_graph_and_eager_modes()
   def testAddWeight(self):
     layer = base_layers.Layer(name='my_layer')
@@ -94,61 +100,6 @@ class BaseLayerTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
         core_layers.Dense(1, activity_regularizer=lambda *args, **kwargs: 0.)
 
-  def testGetVariable(self):
-    with self.test_session():
-
-      class MyLayer(base_layers.Layer):
-
-        def build(self, input_shape):
-          self.my_var = self.add_variable(
-              'my_var', [2, 2], initializer=init_ops.zeros_initializer())
-
-        def call(self, inputs):
-          return inputs * 2
-
-      layer = MyLayer(name='my_layer')
-      inputs = random_ops.random_uniform((5,), seed=1)
-      layer.apply(inputs)
-      layer.apply(inputs)
-      self.assertEqual([v.name for v in layer.variables],
-                       ['my_layer/my_var:0'])
-
-      # Creating a layer with no scope leads to lazy construction of
-      # the scope at apply() time.  It uses scope "<current scope>/base_name"
-      lazy_layer = MyLayer(_reuse=True)
-      with variable_scope.variable_scope('new_scope'):
-        with variable_scope.variable_scope('my_layer'):
-          variable_scope.get_variable('my_var', [2, 2])
-
-        # Smoke test: it runs.
-        lazy_layer.apply(inputs)
-        # The variables were created outside of the Layer, and
-        # reuse=True, so the Layer does not own them and they are not
-        # stored in its collection.
-        self.assertEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer._scope.name, 'new_scope/my_layer')
-
-      # Creating a layer with no scope leads to lazy construction of
-      # the scope at apply() time. If 'scope' argument is passed to
-      # apply(), it uses that scope when accessing variables.
-      lazy_layer = MyLayer(_reuse=True)
-      with variable_scope.variable_scope('new_scope') as new_scope:
-        variable_scope.get_variable('my_var', [2, 2])
-
-        # Smoke test: it runs.
-        lazy_layer.apply(inputs, scope=new_scope)
-        # The variables were created outside of the Layer, and
-        # reuse=True, so the Layer does not own them and they are not
-        # stored in its collection.
-        self.assertEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer._scope.name, 'new_scope')
-
-      # Checking for graph equality is only done in GRAPH mode.
-      with ops.Graph().as_default():
-        inputs_ng = random_ops.random_uniform((5,), seed=1)
-        with self.assertRaisesRegexp(ValueError, r'graph are not the same'):
-          layer.apply(inputs_ng)
-
   @test_util.run_in_graph_and_eager_modes()
   def testCall(self):
 
@@ -165,38 +116,6 @@ class BaseLayerTest(test.TestCase):
       # op is only supported in GRAPH mode
       self.assertEqual(outputs.op.name, 'my_layer/Square')
 
-  def testFirstCallCanCreateVariablesButSecondCanNotWhenBuildEmpty(self):
-    # Note that this test is only run in Graph mode since with EAGER mode we can
-    # still create a new variable on second call.
-
-    class MyLayer(base_layers.Layer):
-
-      def build(self, _):
-        # Do not mark the layer as built.
-        pass
-
-      def call(self, inputs):
-        self.my_var = self.add_variable('my_var', [2, 2])
-        if self.built:
-          # Skip creating on the first call; try to create after it's
-          # built.  This is expected to fail.
-          self.add_variable('this_will_break_on_second_call', [2, 2])
-        return inputs + math_ops.square(self.my_var)
-
-    layer = MyLayer(name='my_layer')
-    inputs = random_ops.random_uniform((2,), seed=1)
-    outputs = layer.apply(inputs)
-    self.assertEqual(layer.built, True)
-    self.assertEqual(outputs.op.name, 'my_layer/add')
-    self.assertEqual([v.name
-                      for v in layer.variables], ['my_layer/my_var:0'])
-    with self.assertRaisesRegexp(ValueError,
-                                 'my_layer/this_will_break_on_second_call'):
-      layer.apply(inputs)
-    # The list of variables hasn't changed.
-    self.assertEqual([v.name
-                      for v in layer.variables], ['my_layer/my_var:0'])
-
   @test_util.run_in_graph_and_eager_modes()
   def testDeepCopy(self):
 
@@ -645,13 +564,14 @@ class BaseLayerTest(test.TestCase):
 
   def testLayerGraphSetInFirstApply(self):
     with ops.Graph().as_default():
-      layer = core_layers.Dense(1)  # Graph at construction time is ignored
+      # Graph at construction time is ignored
+      layer = core_layers.Dense(1)
     with ops.Graph().as_default():
-      layer.apply(constant_op.constant([[1]]))
+      layer.apply(constant_op.constant([[1.]]))
       # layer is now bound to second Graph
     with ops.Graph().as_default(), self.assertRaisesRegexp(
         ValueError, 'Input graph and Layer graph are not the same'):
-      layer.apply(constant_op.constant([[1]]))
+      layer.apply(constant_op.constant([[1.]]))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 2d99b1688f1b2736c0660ba2ac914018b21bf9ed..34a1487e748e41eebae8b87b17c34d0deda8597f 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
@@ -32,201 +33,8 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _Conv(base.Layer):
-  """Abstract nD convolution layer (private, used as implementation base).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of n integers, specifying the
-      length of the convolution window.
-    strides: An integer or tuple/list of n integers,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of n integers, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    use_bias: Boolean, whether the layer uses a bias.
-    kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, the default
-      initializer will be used.
-    kernel_regularizer: Optional regularizer for the convolution kernel.
-    bias_regularizer: Optional regularizer for the bias vector.
-    activity_regularizer: Optional regularizer function for the output.
-    kernel_constraint: Optional projection function to be applied to the
-        kernel after being updated by an `Optimizer` (e.g. used to implement
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
-    bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               activation=None,
-               use_bias=True,
-               kernel_initializer=None,
-               bias_initializer=init_ops.zeros_initializer(),
-               kernel_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               bias_constraint=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_Conv, self).__init__(trainable=trainable, name=name,
-                                activity_regularizer=activity_regularizer,
-                                **kwargs)
-    self.rank = rank
-    self.filters = filters
-    self.kernel_size = utils.normalize_tuple(kernel_size, rank, 'kernel_size')
-    self.strides = utils.normalize_tuple(strides, rank, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.dilation_rate = utils.normalize_tuple(
-        dilation_rate, rank, 'dilation_rate')
-    self.activation = activation
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-    self.kernel_regularizer = kernel_regularizer
-    self.bias_regularizer = bias_regularizer
-    self.kernel_constraint = kernel_constraint
-    self.bias_constraint = bias_constraint
-    self.input_spec = base.InputSpec(ndim=self.rank + 2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
-    kernel_shape = self.kernel_size + (input_dim, self.filters)
-
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.input_spec = base.InputSpec(ndim=self.rank + 2,
-                                     axes={channel_axis: input_dim})
-    self._convolution_op = nn_ops.Convolution(
-        input_shape,
-        filter_shape=self.kernel.get_shape(),
-        dilation_rate=self.dilation_rate,
-        strides=self.strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format,
-                                              self.rank + 2))
-    self.built = True
-
-  def call(self, inputs):
-    outputs = self._convolution_op(inputs, self.kernel)
-
-    if self.use_bias:
-      if self.data_format == 'channels_first':
-        if self.rank == 1:
-          # nn.bias_add does not accept a 1D input tensor.
-          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
-          outputs += bias
-        if self.rank == 2:
-          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
-        if self.rank == 3:
-          # As of Mar 2017, direct addition is significantly slower than
-          # bias_add when computing gradients. To use bias_add, we collapse Z
-          # and Y into a single dimension to obtain a 4D input tensor.
-          outputs_shape = outputs.shape.as_list()
-          if outputs_shape[0] is None:
-            outputs_shape[0] = -1
-          outputs_4d = array_ops.reshape(outputs,
-                                         [outputs_shape[0], outputs_shape[1],
-                                          outputs_shape[2] * outputs_shape[3],
-                                          outputs_shape[4]])
-          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
-          outputs = array_ops.reshape(outputs_4d, outputs_shape)
-      else:
-        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_last':
-      space = input_shape[1:-1]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0]] + new_space +
-                                      [self.filters])
-    else:
-      space = input_shape[2:]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0], self.filters] +
-                                      new_space)
-
-
 @tf_export('layers.Conv1D')
-class Conv1D(_Conv):
+class Conv1D(keras_layers.Conv1D, base.Layer):
   """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -294,8 +102,7 @@ class Conv1D(_Conv):
                trainable=True,
                name=None,
                **kwargs):
-    super(Convolution1D, self).__init__(
-        rank=1,
+    super(Conv1D, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -417,7 +224,7 @@ def conv1d(inputs,
 
 
 @tf_export('layers.Conv2D')
-class Conv2D(_Conv):
+class Conv2D(keras_layers.Conv2D, base.Layer):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -493,7 +300,6 @@ class Conv2D(_Conv):
                name=None,
                **kwargs):
     super(Conv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -622,7 +428,7 @@ def conv2d(inputs,
 
 
 @tf_export('layers.Conv3D')
-class Conv3D(_Conv):
+class Conv3D(keras_layers.Conv3D, base.Layer):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
@@ -699,7 +505,6 @@ class Conv3D(_Conv):
                name=None,
                **kwargs):
     super(Conv3D, self).__init__(
-        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -828,169 +633,8 @@ def conv3d(inputs,
   return layer.apply(inputs)
 
 
-class _SeparableConv(_Conv):
-  """Abstract base layer for separable nD convolution.
-
-  This layer performs a depthwise convolution that acts separately on
-  channels, followed by a pointwise convolution that mixes channels.
-  If `use_bias` is True and a bias initializer is provided,
-  it adds a bias vector to the output.
-  It then optionally applies an activation function to produce the final output.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: A tuple or list of integers specifying the spatial
-      dimensions of the filters. Can be a single integer to specify the same
-      value for all spatial dimensions.
-    strides: A tuple or list of integers specifying the strides
-      of the convolution. Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any `stride` value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of 2 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    depth_multiplier: The number of depthwise convolution output channels for
-      each input channel. The total number of depthwise convolution output
-      channels will be equal to `num_filters_in * depth_multiplier`.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    use_bias: Boolean, whether the layer uses a bias.
-    depthwise_initializer: An initializer for the depthwise convolution kernel.
-    pointwise_initializer: An initializer for the pointwise convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, the default
-      initializer will be used.
-    depthwise_regularizer: Optional regularizer for the depthwise
-      convolution kernel.
-    pointwise_regularizer: Optional regularizer for the pointwise
-      convolution kernel.
-    bias_regularizer: Optional regularizer for the bias vector.
-    activity_regularizer: Optional regularizer function for the output.
-    depthwise_constraint: Optional projection function to be applied to the
-        depthwise kernel after being updated by an `Optimizer` (e.g. used for
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
-    pointwise_constraint: Optional projection function to be applied to the
-        pointwise kernel after being updated by an `Optimizer`.
-    bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self,
-               rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               depth_multiplier=1,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer=None,
-               pointwise_initializer=None,
-               bias_initializer=init_ops.zeros_initializer(),
-               depthwise_regularizer=None,
-               pointwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               pointwise_constraint=None,
-               bias_constraint=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_SeparableConv, self).__init__(
-        rank=rank,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
-        trainable=trainable,
-        name=name,
-        **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = depthwise_initializer
-    self.pointwise_initializer = pointwise_initializer
-    self.depthwise_regularizer = depthwise_regularizer
-    self.pointwise_regularizer = pointwise_regularizer
-    self.depthwise_constraint = depthwise_constraint
-    self.pointwise_constraint = pointwise_constraint
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
-    self.input_spec = base.InputSpec(ndim=self.rank + 2,
-                                     axes={channel_axis: input_dim})
-    depthwise_kernel_shape = self.kernel_size + (input_dim,
-                                                 self.depth_multiplier)
-    pointwise_kernel_shape = (
-        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
-
-    self.depthwise_kernel = self.add_variable(
-        name='depthwise_kernel',
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    self.pointwise_kernel = self.add_variable(
-        name='pointwise_kernel',
-        shape=pointwise_kernel_shape,
-        initializer=self.pointwise_initializer,
-        regularizer=self.pointwise_regularizer,
-        constraint=self.pointwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    raise NotImplementedError
-
-
 @tf_export('layers.SeparableConv1D')
-class SeparableConv1D(_SeparableConv):
+class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer):
   """Depthwise separable 1D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1072,7 +716,6 @@ class SeparableConv1D(_SeparableConv):
                name=None,
                **kwargs):
     super(SeparableConv1D, self).__init__(
-        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -1096,45 +739,9 @@ class SeparableConv1D(_SeparableConv):
         name=name,
         **kwargs)
 
-  def call(self, inputs):
-    if self.data_format == 'channels_last':
-      strides = (1,) + self.strides * 2 + (1,)
-      spatial_start_dim = 1
-    else:
-      strides = (1, 1) + self.strides * 2
-      spatial_start_dim = 2
-
-    # Explicitly broadcast inputs and kernels to 4D.
-    # TODO(fchollet): refactor when a native separable_conv1d op is available.
-    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
-    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
-    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
-    dilation_rate = (1,) + self.dilation_rate
-
-    outputs = nn.separable_conv2d(
-        inputs,
-        depthwise_kernel,
-        pointwise_kernel,
-        strides=strides,
-        padding=self.padding.upper(),
-        rate=dilation_rate,
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
 
 @tf_export('layers.SeparableConv2D')
-class SeparableConv2D(_SeparableConv):
+class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
   """Depthwise separable 2D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1221,7 +828,6 @@ class SeparableConv2D(_SeparableConv):
                name=None,
                **kwargs):
     super(SeparableConv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -1245,31 +851,6 @@ class SeparableConv2D(_SeparableConv):
         name=name,
         **kwargs)
 
-  def call(self, inputs):
-    # Apply the actual ops.
-    if self.data_format == 'channels_last':
-      strides = (1,) + self.strides + (1,)
-    else:
-      strides = (1, 1) + self.strides
-    outputs = nn.separable_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        self.pointwise_kernel,
-        strides=strides,
-        padding=self.padding.upper(),
-        rate=self.dilation_rate,
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
 
 @tf_export('layers.separable_conv1d')
 def separable_conv1d(inputs,
@@ -1511,7 +1092,7 @@ def separable_conv2d(inputs,
 
 
 @tf_export('layers.Conv2DTranspose')
-class Conv2DTranspose(Conv2D):
+class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
   """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -1576,8 +1157,8 @@ class Conv2DTranspose(Conv2D):
                name=None,
                **kwargs):
     super(Conv2DTranspose, self).__init__(
-        filters,
-        kernel_size,
+        filters=filters,
+        kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
@@ -1593,120 +1174,6 @@ class Conv2DTranspose(Conv2D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.input_spec = base.InputSpec(ndim=4)
-
-  def build(self, input_shape):
-    if len(input_shape) != 4:
-      raise ValueError('Inputs should have rank 4. Received input shape: ' +
-                       str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis]
-    self.input_spec = base.InputSpec(ndim=4, axes={channel_axis: input_dim})
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs_shape = array_ops.shape(inputs)
-    batch_size = inputs_shape[0]
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_height = utils.deconv_output_length(height,
-                                            kernel_h,
-                                            self.padding,
-                                            stride_h)
-    out_width = utils.deconv_output_length(width,
-                                           kernel_w,
-                                           self.padding,
-                                           stride_w)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_height, out_width)
-      strides = (1, 1, stride_h, stride_w)
-    else:
-      output_shape = (batch_size, out_height, out_width, self.filters)
-      strides = (1, stride_h, stride_w, 1)
-
-    output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv2d_transpose(
-        inputs,
-        self.kernel,
-        output_shape_tensor,
-        strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if not context.executing_eagerly():
-      # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
-                                                     kernel_h,
-                                                     self.padding,
-                                                     stride_h)
-      out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
-                                                     kernel_w,
-                                                     self.padding,
-                                                     stride_w)
-      outputs.set_shape(out_shape)
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
-    output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
-    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('layers.conv2d_transpose')
@@ -1806,7 +1273,7 @@ def conv2d_transpose(inputs,
 
 
 @tf_export('layers.Conv3DTranspose')
-class Conv3DTranspose(Conv3D):
+class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
   """Transposed 3D convolution layer (sometimes called 3D Deconvolution).
 
   Arguments:
@@ -1885,153 +1352,6 @@ class Conv3DTranspose(Conv3D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.input_spec = base.InputSpec(ndim=5)
-
-  def build(self, input_shape):
-    if len(input_shape) != 5:
-      raise ValueError('Inputs should have rank 5, received input shape:',
-                       str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined, found None: ' + str(input_shape))
-    input_dim = input_shape[channel_axis]
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_variable(
-        'kernel',
-        shape=kernel_shape,
-        initializer=self.kernel_initializer,
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(
-          'bias',
-          shape=(self.filters,),
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint,
-          trainable=True,
-          dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs_shape = array_ops.shape(inputs)
-    batch_size = inputs_shape[0]
-    if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
-    else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    self.input_spec = base.InputSpec(ndim=5,
-                                     axes={c_axis: inputs_shape[c_axis]})
-
-    depth = inputs_shape[d_axis]
-    height = inputs_shape[h_axis]
-    width = inputs_shape[w_axis]
-
-    kernel_d, kernel_h, kernel_w = self.kernel_size
-    stride_d, stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_depth = utils.deconv_output_length(depth,
-                                           kernel_d,
-                                           self.padding,
-                                           stride_d)
-    out_height = utils.deconv_output_length(height,
-                                            kernel_h,
-                                            self.padding,
-                                            stride_h)
-    out_width = utils.deconv_output_length(width,
-                                           kernel_w,
-                                           self.padding,
-                                           stride_w)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_depth, out_height,
-                      out_width)
-      strides = (1, 1, stride_d, stride_h, stride_w)
-    else:
-      output_shape = (batch_size, out_depth, out_height, out_width,
-                      self.filters)
-      strides = (1, stride_d, stride_h, stride_w, 1)
-
-    output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv3d_transpose(
-        inputs,
-        self.kernel,
-        output_shape_tensor,
-        strides,
-        data_format=utils.convert_data_format(self.data_format, ndim=5),
-        padding=self.padding.upper())
-
-    if not context.executing_eagerly():
-      # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis],
-                                                     kernel_d,
-                                                     self.padding,
-                                                     stride_d)
-      out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
-                                                     kernel_h,
-                                                     self.padding,
-                                                     stride_h)
-      out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
-                                                     kernel_w,
-                                                     self.padding,
-                                                     stride_w)
-      outputs.set_shape(out_shape)
-
-    if self.use_bias:
-      outputs_shape = outputs.shape.as_list()
-      if outputs_shape[0] is None:
-        outputs_shape[0] = -1
-      if self.data_format == 'channels_first':
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1],
-            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
-        ])
-      else:
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
-            outputs_shape[3], outputs_shape[4]
-        ])
-      outputs_4d = nn.bias_add(
-          outputs_4d,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-      outputs = array_ops.reshape(outputs_4d, outputs_shape)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
-    else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    kernel_d, kernel_h, kernel_w = self.kernel_size
-    stride_d, stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[d_axis] = utils.deconv_output_length(
-        output_shape[d_axis], kernel_d, self.padding, stride_d)
-    output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
-    output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
-    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('layers.conv3d_transpose')
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index e598d9f83ab21f2dd5fabb3dd37fa0bfb5f003a4..6d8e9eac878bb2eb65bfa29e872a0576a39af662 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -27,23 +27,14 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import standard_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('layers.Dense')
-class Dense(base.Layer):
+class Dense(keras_layers.Dense, base.Layer):
   """Densely-connected layer class.
 
   This layer implements the operation:
@@ -108,73 +99,19 @@ class Dense(base.Layer):
                trainable=True,
                name=None,
                **kwargs):
-    super(Dense, self).__init__(trainable=trainable, name=name,
+    super(Dense, self).__init__(units=units,
+                                activation=activation,
+                                use_bias=use_bias,
+                                kernel_initializer=kernel_initializer,
+                                bias_initializer=bias_initializer,
+                                kernel_regularizer=kernel_regularizer,
+                                bias_regularizer=bias_regularizer,
                                 activity_regularizer=activity_regularizer,
+                                kernel_constraint=kernel_constraint,
+                                bias_constraint=bias_constraint,
+                                trainable=trainable,
+                                name=name,
                                 **kwargs)
-    self.units = units
-    self.activation = activation
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-    self.kernel_regularizer = kernel_regularizer
-    self.bias_regularizer = bias_regularizer
-    self.kernel_constraint = kernel_constraint
-    self.bias_constraint = bias_constraint
-    self.input_spec = base.InputSpec(min_ndim=2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if input_shape[-1].value is None:
-      raise ValueError('The last dimension of the inputs to `Dense` '
-                       'should be defined. Found `None`.')
-    self.input_spec = base.InputSpec(min_ndim=2,
-                                     axes={-1: input_shape[-1].value})
-    self.kernel = self.add_variable('kernel',
-                                    shape=[input_shape[-1].value, self.units],
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
-    if self.use_bias:
-      self.bias = self.add_variable('bias',
-                                    shape=[self.units,],
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
-    shape = inputs.get_shape().as_list()
-    if len(shape) > 2:
-      # Broadcasting is required for the inputs.
-      outputs = standard_ops.tensordot(inputs, self.kernel, [[len(shape) - 1],
-                                                             [0]])
-      # Reshape the output back to the original ndim of the input.
-      if not context.executing_eagerly():
-        output_shape = shape[:-1] + [self.units]
-        outputs.set_shape(output_shape)
-    else:
-      outputs = gen_math_ops.mat_mul(inputs, self.kernel)
-    if self.use_bias:
-      outputs = nn.bias_add(outputs, self.bias)
-    if self.activation is not None:
-      return self.activation(outputs)  # pylint: disable=not-callable
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    input_shape = input_shape.with_rank_at_least(2)
-    if input_shape[-1].value is None:
-      raise ValueError(
-          'The innermost dimension of input_shape must be defined, but saw: %s'
-          % input_shape)
-    return input_shape[:-1].concatenate(self.units)
 
 
 @tf_export('layers.dense')
@@ -254,7 +191,7 @@ def dense(
 
 
 @tf_export('layers.Dropout')
-class Dropout(base.Layer):
+class Dropout(keras_layers.Dropout, base.Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting a fraction `rate` of input units to 0
@@ -282,31 +219,14 @@ class Dropout(base.Layer):
                seed=None,
                name=None,
                **kwargs):
-    super(Dropout, self).__init__(name=name, **kwargs)
-    self.rate = rate
-    self.noise_shape = noise_shape
-    self.seed = seed
-
-  def _get_noise_shape(self, inputs):
-    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
-    # which will override `self.noise_shape`, and allows for custom noise
-    # shapes with dynamically sized inputs.
-    if self.noise_shape is None:
-      return self.noise_shape
-    return nn_ops._get_noise_shape(inputs, self.noise_shape)
+    super(Dropout, self).__init__(rate=rate,
+                                  noise_shape=noise_shape,
+                                  seed=seed,
+                                  name=name,
+                                  **kwargs)
 
   def call(self, inputs, training=False):
-
-    def dropped_inputs():
-      return nn.dropout(inputs, 1  - self.rate,
-                        noise_shape=self._get_noise_shape(inputs),
-                        seed=self.seed)
-    return utils.smart_cond(training,
-                            dropped_inputs,
-                            lambda: array_ops.identity(inputs))
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    return super(Dropout, self).call(inputs, training=training)
 
 
 @tf_export('layers.dropout')
@@ -352,7 +272,7 @@ def dropout(inputs,
 
 
 @tf_export('layers.Flatten')
-class Flatten(base.Layer):
+class Flatten(keras_layers.Flatten, base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
   Examples:
@@ -367,25 +287,7 @@ class Flatten(base.Layer):
     # now `y` has shape `(None, None)`
   ```
   """
-
-  def __init__(self, **kwargs):
-    super(Flatten, self).__init__(**kwargs)
-    self.input_spec = base.InputSpec(min_ndim=2)
-
-  def call(self, inputs):
-    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
-    if not context.executing_eagerly():
-      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = [input_shape[0]]
-    if all(input_shape[1:]):
-      output_shape += [np.prod(input_shape[1:])]
-    else:
-      output_shape += [None]
-    return tensor_shape.TensorShape(output_shape)
+  pass
 
 
 @tf_export('layers.flatten')
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 83b201e6420d48cfab38048d6638a9f9185d7d6c..33284b0d695272db5a4e0d757d6f24b1930068de 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -24,26 +24,14 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import moving_averages
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('layers.BatchNormalization')
-class BatchNormalization(base.Layer):
+class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
   "Batch Normalization: Accelerating Deep Network Training by Reducing
@@ -143,485 +131,31 @@ class BatchNormalization(base.Layer):
                name=None,
                **kwargs):
     super(BatchNormalization, self).__init__(
-        name=name, trainable=trainable, **kwargs)
-    if isinstance(axis, list):
-      self.axis = axis[:]
-    else:
-      self.axis = axis
-    self.momentum = momentum
-    self.epsilon = epsilon
-    self.center = center
-    self.scale = scale
-    self.beta_initializer = beta_initializer
-    self.gamma_initializer = gamma_initializer
-    self.moving_mean_initializer = moving_mean_initializer
-    self.moving_variance_initializer = moving_variance_initializer
-    self.beta_regularizer = beta_regularizer
-    self.gamma_regularizer = gamma_regularizer
-    self.beta_constraint = beta_constraint
-    self.gamma_constraint = gamma_constraint
-    self.renorm = renorm
-    self.virtual_batch_size = virtual_batch_size
-    self.adjustment = adjustment
-    if fused is None:
-      fused = True
-
-    self.fused = fused
-    self._bessels_correction_test_only = True
-
-    if renorm:
-      renorm_clipping = renorm_clipping or {}
-      keys = ['rmax', 'rmin', 'dmax']
-      if set(renorm_clipping) - set(keys):
-        raise ValueError('renorm_clipping %s contains keys not in %s' %
-                         (renorm_clipping, keys))
-      self.renorm_clipping = renorm_clipping
-      self.renorm_momentum = renorm_momentum
-
-  def _add_tower_local_variable(self, *args, **kwargs):
-    tower_context = distribute_lib.get_tower_context()
-    with tower_context.tower_local_var_scope('mean'):
-      return self.add_variable(*args, **kwargs)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if not input_shape.ndims:
-      raise ValueError('Input has undefined rank:', input_shape)
-    ndims = len(input_shape)
-
-    # Convert axis to list and resolve negatives
-    if isinstance(self.axis, int):
-      self.axis = [self.axis]
-
-    if not isinstance(self.axis, list):
-      raise TypeError('axis must be int or list, type given: %s'
-                      % type(self.axis))
-
-    for idx, x in enumerate(self.axis):
-      if x < 0:
-        self.axis[idx] = ndims + x
-
-    # Validate axes
-    for x in self.axis:
-      if x < 0 or x >= ndims:
-        raise ValueError('Invalid axis: %d' % x)
-    if len(self.axis) != len(set(self.axis)):
-      raise ValueError('Duplicate axis: %s' % self.axis)
-
-    if self.virtual_batch_size is not None:
-      if self.virtual_batch_size <= 0:
-        raise ValueError('virtual_batch_size must be a positive integer that '
-                         'divides the true batch size of the input Tensor')
-      # If using virtual batches, the first dimension must be the batch
-      # dimension and cannot be the batch norm axis
-      if 0 in self.axis:
-        raise ValueError('When using virtual_batch_size, the batch dimension '
-                         'must be 0 and thus axis cannot include 0')
-      if self.adjustment is not None:
-        raise ValueError('When using virtual_batch_size, adjustment cannot '
-                         'be specified')
-
-    if self.fused:
-      # Currently fused batch norm doesn't support renorm. It also only supports
-      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
-      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
-      # output back to its original shape accordingly.
-      self.fused = (not self.renorm and
-                    ndims == 4 and
-                    self.axis in [[1], [3]] and
-                    self.virtual_batch_size is None and
-                    self.adjustment is None)
-      # TODO(chrisying): fused batch norm is currently not supported for
-      # multi-axis batch norm and by extension virtual batches. In some cases,
-      # it might be possible to use fused batch norm but would require reshaping
-      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
-      # particularly tricky. A compromise might be to just support the most
-      # common use case (turning 5D w/ virtual batch to NCHW)
-
-    if self.fused:
-      if self.axis == [1]:
-        self._data_format = 'NCHW'
-      elif self.axis == [3]:
-        self._data_format = 'NHWC'
-      else:
-        raise ValueError('Unsupported axis, fused batch norm only supports '
-                         'axis == [1] or axis == [3]')
-
-    # Raise parameters of fp16 batch norm to fp32
-    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
-      param_dtype = dtypes.float32
-    else:
-      param_dtype = self.dtype or dtypes.float32
-
-    axis_to_dim = {x: input_shape[x].value for x in self.axis}
-    for x in axis_to_dim:
-      if axis_to_dim[x] is None:
-        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
-                         input_shape)
-    self.input_spec = base.InputSpec(ndim=ndims, axes=axis_to_dim)
-
-    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
-      # Single axis batch norm (most common/default use-case)
-      param_shape = (list(axis_to_dim.values())[0],)
-    else:
-      # Parameter shape is the original shape but with 1 in all non-axis dims
-      param_shape = [axis_to_dim[i] if i in axis_to_dim
-                     else 1 for i in range(ndims)]
-      if self.virtual_batch_size is not None:
-        # When using virtual batches, add an extra dim at index 1
-        param_shape.insert(1, 1)
-        for idx, x in enumerate(self.axis):
-          self.axis[idx] = x + 1      # Account for added dimension
-
-    if self.scale:
-      self.gamma = self.add_variable(
-          name='gamma',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.gamma_initializer,
-          regularizer=self.gamma_regularizer,
-          constraint=self.gamma_constraint,
-          trainable=True)
-    else:
-      self.gamma = None
-      if self.fused:
-        self._gamma_const = array_ops.constant(
-            1.0, dtype=param_dtype, shape=param_shape)
-
-    if self.center:
-      self.beta = self.add_variable(
-          name='beta',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.beta_initializer,
-          regularizer=self.beta_regularizer,
-          constraint=self.beta_constraint,
-          trainable=True)
-    else:
-      self.beta = None
-      if self.fused:
-        self._beta_const = array_ops.constant(
-            0.0, dtype=param_dtype, shape=param_shape)
-
-    # Disable variable partitioning when creating the moving mean and variance
-    try:
-      if self._scope:
-        partitioner = self._scope.partitioner
-        self._scope.set_partitioner(None)
-      else:
-        partitioner = None
-      self.moving_mean = self._add_tower_local_variable(
-          name='moving_mean',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.moving_mean_initializer,
-          trainable=False)
-
-      self.moving_variance = self._add_tower_local_variable(
-          name='moving_variance',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.moving_variance_initializer,
-          trainable=False)
-
-      if self.renorm:
-        # Create variables to maintain the moving mean and standard deviation.
-        # These are used in training and thus are different from the moving
-        # averages above. The renorm variables are colocated with moving_mean
-        # and moving_variance.
-        # NOTE: below, the outer `with device` block causes the current device
-        # stack to be cleared. The nested ones use a `lambda` to set the desired
-        # device and ignore any devices that may be set by the custom getter.
-        def _renorm_variable(name, shape):
-          var = self._add_tower_local_variable(
-              name=name,
-              shape=shape,
-              dtype=param_dtype,
-              initializer=init_ops.zeros_initializer(),
-              trainable=False)
-          return var
-
-        with distribute_lib.get_distribution_strategy().colocate_vars_with(
-            self.moving_mean):
-          self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
-          self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
-        # We initialize renorm_stddev to 0, and maintain the (0-initialized)
-        # renorm_stddev_weight. This allows us to (1) mix the average
-        # stddev with the minibatch stddev early in training, and (2) compute
-        # the unbiased average stddev by dividing renorm_stddev by the weight.
-        with distribute_lib.get_distribution_strategy().colocate_vars_with(
-            self.moving_variance):
-          self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
-          self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
-                                                       ())
-    finally:
-      if partitioner:
-        self._scope.set_partitioner(partitioner)
-    self.built = True
-
-  def _assign_moving_average(self, variable, value, momentum):
-    with ops.name_scope(None, 'AssignMovingAvg',
-                        [variable, value, momentum]) as scope:
-      decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
-      if decay.dtype != variable.dtype.base_dtype:
-        decay = math_ops.cast(decay, variable.dtype.base_dtype)
-      update_delta = (variable - value) * decay
-      return state_ops.assign_sub(variable, update_delta, name=scope)
-
-  def _fused_batch_norm(self, inputs, training):
-    """Returns the output of fused batch norm."""
-    beta = self.beta if self.center else self._beta_const
-    gamma = self.gamma if self.scale else self._gamma_const
-
-    def _fused_batch_norm_training():
-      return nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          epsilon=self.epsilon,
-          data_format=self._data_format)
-
-    def _fused_batch_norm_inference():
-      return nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          mean=self.moving_mean,
-          variance=self.moving_variance,
-          epsilon=self.epsilon,
-          is_training=False,
-          data_format=self._data_format)
-
-    output, mean, variance = utils.smart_cond(
-        training, _fused_batch_norm_training, _fused_batch_norm_inference)
-    if not self._bessels_correction_test_only:
-      # Remove Bessel's correction to be consistent with non-fused batch norm.
-      # Note that the variance computed by fused batch norm is
-      # with Bessel's correction.
-      sample_size = math_ops.cast(
-          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
-      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
-      variance *= factor
-
-    training_value = utils.constant_value(training)
-    if training_value is None:
-      momentum = utils.smart_cond(training, lambda: self.momentum, lambda: 1.0)
-    else:
-      momentum = ops.convert_to_tensor(self.momentum)
-    if training_value or training_value is None:
-      mean_update = self._assign_moving_average(self.moving_mean, mean,
-                                                momentum)
-      variance_update = self._assign_moving_average(self.moving_variance,
-                                                    variance, momentum)
-      self.add_update(mean_update, inputs=inputs)
-      self.add_update(variance_update, inputs=inputs)
-
-    return output
-
-  def _renorm_correction_and_moments(self, mean, variance, training):
-    """Returns the correction and update values for renorm."""
-    stddev = math_ops.sqrt(variance + self.epsilon)
-    # Compute the average mean and standard deviation, as if they were
-    # initialized with this batch's moments.
-    mixed_renorm_mean = (self.renorm_mean +
-                         (1. - self.renorm_mean_weight) * mean)
-    mixed_renorm_stddev = (self.renorm_stddev +
-                           (1. - self.renorm_stddev_weight) * stddev)
-    # Compute the corrections for batch renorm.
-    r = stddev / mixed_renorm_stddev
-    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
-    # Ensure the corrections use pre-update moving averages.
-    with ops.control_dependencies([r, d]):
-      mean = array_ops.identity(mean)
-      stddev = array_ops.identity(stddev)
-    rmin, rmax, dmax = [self.renorm_clipping.get(key)
-                        for key in ['rmin', 'rmax', 'dmax']]
-    if rmin is not None:
-      r = math_ops.maximum(r, rmin)
-    if rmax is not None:
-      r = math_ops.minimum(r, rmax)
-    if dmax is not None:
-      d = math_ops.maximum(d, -dmax)
-      d = math_ops.minimum(d, dmax)
-    # When not training, use r=1, d=0.
-    r = utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
-    d = utils.smart_cond(training, lambda: d, lambda: array_ops.zeros_like(d))
-
-    def _update_renorm_variable(var, weight, value):
-      """Updates a moving average and weight, returns the unbiased value."""
-      value = array_ops.identity(value)
-      def _do_update():
-        """Updates the var and weight, returns their updated ratio."""
-        # Update the variables without zero debiasing. The debiasing will be
-        # accomplished by dividing the exponential moving average by the weight.
-        # For example, after a single update, the moving average would be
-        # (1-decay) * value. and the weight will be 1-decay, with their ratio
-        # giving the value.
-        # Make sure the weight is not updated until before r and d computation.
-        with ops.control_dependencies([value]):
-          weight_value = array_ops.constant(1., dtype=weight.dtype)
-        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
-        new_weight = self._assign_moving_average(weight, weight_value,
-                                                 self.renorm_momentum)
-        # TODO(yuefengz): the updates to var and weighted can not be batched
-        # together if we fetch their updated values here. Consider calculating
-        # new values and delaying the updates.
-        return new_var / new_weight
-
-      def _fake_update():
-        return array_ops.identity(var)
-      return utils.smart_cond(training, _do_update, _fake_update)
-
-    # TODO(yuefengz): colocate the operations
-    new_mean = _update_renorm_variable(self.renorm_mean,
-                                       self.renorm_mean_weight, mean)
-    new_stddev = _update_renorm_variable(self.renorm_stddev,
-                                         self.renorm_stddev_weight, stddev)
-    # Make sqrt(moving_variance + epsilon) = new_stddev.
-    new_variance = math_ops.square(new_stddev) - self.epsilon
-
-    return (r, d, new_mean, new_variance)
+        axis=axis,
+        momentum=momentum,
+        epsilon=epsilon,
+        center=center,
+        scale=scale,
+        beta_initializer=beta_initializer,
+        gamma_initializer=gamma_initializer,
+        moving_mean_initializer=moving_mean_initializer,
+        moving_variance_initializer=moving_variance_initializer,
+        beta_regularizer=beta_regularizer,
+        gamma_regularizer=gamma_regularizer,
+        beta_constraint=beta_constraint,
+        gamma_constraint=gamma_constraint,
+        renorm=renorm,
+        renorm_clipping=renorm_clipping,
+        renorm_momentum=renorm_momentum,
+        fused=fused,
+        trainable=trainable,
+        virtual_batch_size=virtual_batch_size,
+        adjustment=adjustment,
+        name=name,
+        **kwargs)
 
   def call(self, inputs, training=False):
-    in_eager_mode = context.executing_eagerly()
-    if self.virtual_batch_size is not None:
-      # Virtual batches (aka ghost batches) can be simulated by reshaping the
-      # Tensor and reusing the existing batch norm implementation
-      original_shape = [-1] + inputs.shape.as_list()[1:]
-      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
-
-      # Will cause errors if virtual_batch_size does not divide the batch size
-      inputs = array_ops.reshape(inputs, expanded_shape)
-
-      def undo_virtual_batching(outputs):
-        outputs = array_ops.reshape(outputs, original_shape)
-        return outputs
-
-    if self.fused:
-      outputs = self._fused_batch_norm(inputs, training=training)
-      if self.virtual_batch_size is not None:
-        # Currently never reaches here since fused_batch_norm does not support
-        # virtual batching
-        return undo_virtual_batching(outputs)
-      return outputs
-
-    # Compute the axes along which to reduce the mean / variance
-    input_shape = inputs.get_shape()
-    ndims = len(input_shape)
-    reduction_axes = [i for i in range(ndims) if i not in self.axis]
-    if self.virtual_batch_size is not None:
-      del reduction_axes[1]     # Do not reduce along virtual batch dim
-
-    # Broadcasting only necessary for single-axis batch norm where the axis is
-    # not the last dimension
-    broadcast_shape = [1] * ndims
-    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
-    def _broadcast(v):
-      if (v is not None and
-          len(v.get_shape()) != ndims and
-          reduction_axes != list(range(ndims - 1))):
-        return array_ops.reshape(v, broadcast_shape)
-      return v
-
-    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
-
-    def _compose_transforms(scale, offset, then_scale, then_offset):
-      if then_scale is not None:
-        scale *= then_scale
-        offset *= then_scale
-      if then_offset is not None:
-        offset += then_offset
-      return (scale, offset)
-
-    # Determine a boolean value for `training`: could be True, False, or None.
-    training_value = utils.constant_value(training)
-    if training_value is not False:
-      if self.adjustment:
-        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
-        # Adjust only during training.
-        adj_scale = utils.smart_cond(training,
-                                     lambda: adj_scale,
-                                     lambda: array_ops.ones_like(adj_scale))
-        adj_bias = utils.smart_cond(training,
-                                    lambda: adj_bias,
-                                    lambda: array_ops.zeros_like(adj_bias))
-        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
-
-      # Some of the computations here are not necessary when training==False
-      # but not a constant. However, this makes the code simpler.
-      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
-      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
-
-      moving_mean = self.moving_mean
-      moving_variance = self.moving_variance
-
-      mean = utils.smart_cond(training,
-                              lambda: mean,
-                              lambda: moving_mean)
-      variance = utils.smart_cond(training,
-                                  lambda: variance,
-                                  lambda: moving_variance)
-
-      if self.renorm:
-        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
-            mean, variance, training)
-        # When training, the normalized values (say, x) will be transformed as
-        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
-        # = x * (r * gamma) + (d * gamma + beta) with renorm.
-        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
-        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
-        scale, offset = _compose_transforms(r, d, scale, offset)
-      else:
-        new_mean, new_variance = mean, variance
-
-      if self.virtual_batch_size is not None:
-        # This isn't strictly correct since in ghost batch norm, you are
-        # supposed to sequentially update the moving_mean and moving_variance
-        # with each sub-batch. However, since the moving statistics are only
-        # used during evaluation, it is more efficient to just update in one
-        # step and should not make a significant difference in the result.
-        new_mean = math_ops.reduce_mean(new_mean,
-                                        axis=1, keep_dims=True)
-        new_variance = math_ops.reduce_mean(new_variance,
-                                            axis=1, keep_dims=True)
-
-      def _do_update(var, value):
-        if in_eager_mode and not self.trainable:
-          return
-
-        return self._assign_moving_average(var, value, self.momentum)
-
-      mean_update = utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_mean, new_mean),
-          lambda: self.moving_mean)
-      variance_update = utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_variance, new_variance),
-          lambda: self.moving_variance)
-      if not context.executing_eagerly():
-        self.add_update(mean_update, inputs=inputs)
-        self.add_update(variance_update, inputs=inputs)
-
-    else:
-      mean, variance = self.moving_mean, self.moving_variance
-
-    outputs = nn.batch_normalization(inputs,
-                                     _broadcast(mean),
-                                     _broadcast(variance),
-                                     offset,
-                                     scale,
-                                     self.epsilon)
-    # If some components of the shape got lost due to adjustments, fix that.
-    outputs.set_shape(input_shape)
-
-    if self.virtual_batch_size is not None:
-      return undo_virtual_batching(outputs)
-
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    return super(BatchNormalization, self).call(inputs, training=training)
 
 
 @tf_export('layers.batch_normalization')
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 50503ce093fbc251b11c4d5cbccb2a2683d92e7a..75abe56f51f2a206ea3e5a5dad032446c150293a 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -13,92 +13,19 @@
 # limitations under the License.
 # =============================================================================
 
-# pylint: disable=unused-import,g-bad-import-order
 """Contains the pooling layer classes and their functional aliases.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _Pooling1D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 1D inputs.
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of a single integer,
-      representing the size of the pooling window.
-    strides: An integer or tuple/list of a single integer, specifying the
-      strides of the pooling operation.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling1D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 1, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 1, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=3)
-
-  def call(self, inputs):
-    # There is no TF op for 1D pooling, hence we make the inputs 4D.
-    if self.data_format == 'channels_last':
-      # input is NWC, make it NHWC
-      inputs = array_ops.expand_dims(inputs, 1)
-      # pool on the W dim
-      pool_shape = (1, 1) + self.pool_size + (1,)
-      strides = (1, 1) + self.strides + (1,)
-      data_format = 'NHWC'
-    else:
-      # input is NCW, make it NCHW
-      inputs = array_ops.expand_dims(inputs, 2)
-      # pool on the W dim
-      pool_shape = (1, 1, 1) + self.pool_size
-      strides = (1, 1, 1) + self.strides
-      data_format = 'NCHW'
-
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=data_format)
-
-    if self.data_format == 'channels_last':
-      return array_ops.squeeze(outputs, 1)
-    else:
-      return array_ops.squeeze(outputs, 2)
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    length = utils.conv_output_length(input_shape[1], self.pool_size[0],
-                                      self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
-
-
 @tf_export('layers.AveragePooling1D')
-class AveragePooling1D(_Pooling1D):
+class AveragePooling1D(keras_layers.AveragePooling1D, base.Layer):
   """Average Pooling layer for 1D inputs.
 
   Arguments:
@@ -119,8 +46,9 @@ class AveragePooling1D(_Pooling1D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling1D, self).__init__(
-        nn.avg_pool,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -165,7 +93,7 @@ def average_pooling1d(inputs, pool_size, strides,
 
 
 @tf_export('layers.MaxPooling1D')
-class MaxPooling1D(_Pooling1D):
+class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer):
   """Max Pooling layer for 1D inputs.
 
   Arguments:
@@ -186,8 +114,9 @@ class MaxPooling1D(_Pooling1D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling1D, self).__init__(
-        nn.max_pool,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -231,79 +160,8 @@ def max_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
-class _Pooling2D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 2 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling2D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 2, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=4)
-
-  def call(self, inputs):
-    if self.data_format == 'channels_last':
-      pool_shape = (1,) + self.pool_size + (1,)
-      strides = (1,) + self.strides + (1,)
-    else:
-      pool_shape = (1, 1) + self.pool_size
-      strides = (1, 1) + self.strides
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, 4))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-    else:
-      rows = input_shape[1]
-      cols = input_shape[2]
-    rows = utils.conv_output_length(rows, self.pool_size[0], self.padding,
-                                    self.strides[0])
-    cols = utils.conv_output_length(cols, self.pool_size[1], self.padding,
-                                    self.strides[1])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], rows, cols])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, input_shape[3]])
-
-
 @tf_export('layers.AveragePooling2D')
-class AveragePooling2D(_Pooling2D):
+class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer):
   """Average pooling layer for 2D inputs (e.g. images).
 
   Arguments:
@@ -328,8 +186,9 @@ class AveragePooling2D(_Pooling2D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling2D, self).__init__(
-        nn.avg_pool,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -373,7 +232,7 @@ def average_pooling2d(inputs,
 
 
 @tf_export('layers.MaxPooling2D')
-class MaxPooling2D(_Pooling2D):
+class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer):
   """Max pooling layer for 2D inputs (e.g. images).
 
   Arguments:
@@ -398,8 +257,9 @@ class MaxPooling2D(_Pooling2D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling2D, self).__init__(
-        nn.max_pool,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -442,90 +302,8 @@ def max_pooling2d(inputs,
   return layer.apply(inputs)
 
 
-class _Pooling3D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 3D inputs.
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of 3 integers:
-      (pool_depth, pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)`
-      while `channels_first` corresponds to
-      inputs with shape `(batch, channels, depth, height, width)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling3D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 3, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 3, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=5)
-
-  def call(self, inputs):
-    pool_shape = (1,) + self.pool_size + (1,)
-    strides = (1,) + self.strides + (1,)
-
-    if self.data_format == 'channels_first':
-      # TF does not support `channels_first` with 3D pooling operations,
-      # so we must handle this case manually.
-      # TODO(fchollet): remove this when TF pooling is feature-complete.
-      inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
-
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper())
-
-    if self.data_format == 'channels_first':
-      outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      len_dim1 = input_shape[2]
-      len_dim2 = input_shape[3]
-      len_dim3 = input_shape[4]
-    else:
-      len_dim1 = input_shape[1]
-      len_dim2 = input_shape[2]
-      len_dim3 = input_shape[3]
-    len_dim1 = utils.conv_output_length(len_dim1, self.pool_size[0],
-                                        self.padding, self.strides[0])
-    len_dim2 = utils.conv_output_length(len_dim2, self.pool_size[1],
-                                        self.padding, self.strides[1])
-    len_dim3 = utils.conv_output_length(len_dim3, self.pool_size[2],
-                                        self.padding, self.strides[2])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
-
-
 @tf_export('layers.AveragePooling3D')
-class AveragePooling3D(_Pooling3D):
+class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer):
   """Average pooling layer for 3D inputs (e.g. volumes).
 
   Arguments:
@@ -552,8 +330,9 @@ class AveragePooling3D(_Pooling3D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling3D, self).__init__(
-        nn.avg_pool3d,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -599,7 +378,7 @@ def average_pooling3d(inputs,
 
 
 @tf_export('layers.MaxPooling3D')
-class MaxPooling3D(_Pooling3D):
+class MaxPooling3D(keras_layers.MaxPooling3D, base.Layer):
   """Max pooling layer for 3D inputs (e.g. volumes).
 
   Arguments:
@@ -626,8 +405,9 @@ class MaxPooling3D(_Pooling3D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling3D, self).__init__(
-        nn.max_pool3d,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
diff --git a/tensorflow/python/layers/utils_test.py b/tensorflow/python/layers/utils_test.py
index c941aad7bc63dbb891fbe78cd2a47dd6805bf231..7e94dda648166780af002ce6b979a751a0ced846 100644
--- a/tensorflow/python/layers/utils_test.py
+++ b/tensorflow/python/layers/utils_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -89,33 +88,5 @@ class ConvUtilsTest(test.TestCase):
     self.assertEqual(6, utils.deconv_output_length(4, 2, 'full', 2))
 
 
-class GraphUtilsTest(test.TestCase):
-
-  def testGetReachableFromInputs(self):
-
-    with self.test_session():
-      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
-      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
-      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
-      x_1 = pl_1 + pl_2
-      x_2 = pl_2 * 2
-      x_3 = pl_3 + 1
-      x_4 = x_1 + x_2
-      x_5 = x_3 * pl_1
-
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_1]),
-          {pl_1, x_1, x_4, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_1, pl_2]),
-          {pl_1, pl_2, x_1, x_2, x_4, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_3]),
-          {pl_3, x_3, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([x_3]),
-          {x_3, x_5})
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index fa26e07c8535d537e366e06902ce28e67d6e6dfe..ceeabe090dff9c7d945a494ba68b9b39e13df681 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -144,6 +144,7 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin,protected-access
 @tf_export("expand_dims")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
 
@@ -193,11 +194,7 @@ def expand_dims(input, axis=None, name=None, dim=None):
   Raises:
     ValueError: if both `dim` and `axis` are specified.
   """
-  # TODO(aselle): Remove argument dim
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("can't specify both 'dim' and 'axis'")
-    axis = dim
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   return gen_array_ops.expand_dims(input, axis, name)
 
 
@@ -2581,6 +2578,8 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
 
 
 @tf_export("squeeze")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead",
+                             "squeeze_dims")
 def squeeze(input, axis=None, name=None, squeeze_dims=None):
   # pylint: disable=redefined-builtin
   """Removes dimensions of size 1 from the shape of a tensor.
@@ -2621,10 +2620,8 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
   Raises:
     ValueError: When both `squeeze_dims` and `axis` are specified.
   """
-  if squeeze_dims is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'squeeze_dims' and 'axis'")
-    axis = squeeze_dims
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "squeeze_dims", squeeze_dims)
   if np.isscalar(axis):
     axis = [axis]
   return gen_array_ops.squeeze(input, axis, name)
diff --git a/tensorflow/python/ops/batch_norm_benchmark.py b/tensorflow/python/ops/batch_norm_benchmark.py
index 5d68b47aeaef3a90973387ecd5b265eef1e96a5f..d83b81909755df8d187232e15ecda48b1cbf4557 100644
--- a/tensorflow/python/ops/batch_norm_benchmark.py
+++ b/tensorflow/python/ops/batch_norm_benchmark.py
@@ -25,6 +25,7 @@ import time
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradients_impl
@@ -39,7 +40,7 @@ from tensorflow.python.platform import test
 def batch_norm_op(tensor, mean, variance, beta, gamma, scale):
   """Fused kernel for batch normalization."""
   # _batch_norm_with_global_normalization is deprecated in v9
-  ops.get_default_graph().graph_def_versions.producer = 8
+  test_util.set_producer_version(ops.get_default_graph(), 8)
   # pylint: disable=protected-access
   return gen_nn_ops._batch_norm_with_global_normalization(
       tensor, mean, variance, beta, gamma, 0.001, scale)
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 174d00987f9f76b4b07be73e5c29435bed7dfa06..2a2bcdd9d69b7a0aed1e7f3d3197cf6d7dd98451 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -115,7 +115,7 @@ class TreeEnsemble(object):
 
   def get_stamp_token(self):
     """Returns the current stamp token of the resource."""
-    stamp_token, _, _, _ = (
+    stamp_token, _, _, _, _ = (
         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
             self.resource_handle))
     return stamp_token
@@ -124,17 +124,20 @@ class TreeEnsemble(object):
     """Returns states of the tree ensemble.
 
     Returns:
-      stamp_token, num_trees, num_finalized_trees, num_attempted_layers.
+      stamp_token, num_trees, num_finalized_trees, num_attempted_layers and
+      range of the nodes in the latest layer.
     """
-    stamp_token, num_trees, num_finalized_trees, num_attempted_layers = (
-        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
-            self.resource_handle))
+    (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+     nodes_range) = (
+         gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+             self.resource_handle))
     # Use identity to give names.
     return (array_ops.identity(stamp_token, name='stamp_token'),
             array_ops.identity(num_trees, name='num_trees'),
             array_ops.identity(num_finalized_trees, name='num_finalized_trees'),
             array_ops.identity(
-                num_attempted_layers, name='num_attempted_layers'))
+                num_attempted_layers, name='num_attempted_layers'),
+            array_ops.identity(nodes_range, name='last_layer_nodes_range'))
 
   def serialize(self):
     """Serializes the ensemble into proto and returns the serialized proto.
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 49f8c665313562cb20dbe4494103ded16646c741..0829aa67ed5236a7c2af89fc104f1d203c8a0f23 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -26,6 +26,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -58,18 +60,34 @@ def clip_by_value(t, clip_value_min, clip_value_max,
   """
   with ops.name_scope(name, "clip_by_value",
                       [t, clip_value_min, clip_value_max]) as name:
-    t = ops.convert_to_tensor(t, name="t")
-
-    # Go through list of tensors, for each value in each tensor clip
-    t_min = math_ops.minimum(t, clip_value_max)
-    # Assert that the shape is compatible with the initial shape,
-    # to prevent unintentional broadcasting.
-    _ = t.shape.merge_with(t_min.shape)
-
-    t_max = math_ops.maximum(t_min, clip_value_min, name=name)
-    _ = t.shape.merge_with(t_max.shape)
-
-  return t_max
+    return gen_math_ops.clip_by_value(t,
+                                      clip_value_min,
+                                      clip_value_max,
+                                      name=name)
+
+@ops.RegisterGradient("ClipByValue")
+def _ClipByValueGrad(op, grad):
+  """Returns grad of clip_by_value."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  z = op.inputs[2]
+  gdtype = grad.dtype
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  sz = array_ops.shape(z)
+  gradshape = array_ops.shape(grad)
+  zeros = array_ops.zeros(gradshape, gdtype)
+  xymask = math_ops.less(x, y)
+  xzmask = math_ops.greater(x, z)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  rx, rz = gen_array_ops.broadcast_gradient_args(sx, sz)
+  xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
+  ygrad = array_ops.where(xymask, grad, zeros)
+  zgrad = array_ops.where(xzmask, grad, zeros)
+  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
+  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
+  gz = array_ops.reshape(math_ops.reduce_sum(zgrad, rz), sz)
+  return (gx, gy, gz)
 
 
 @tf_export("clip_by_norm")
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 45955554cab130597e106660ff1fb4cdf7e9aeb1..6a551deb5ba55871b3a3fb144a6ecd2a3cbfcbd8 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
@@ -74,6 +75,11 @@ def _SwitchGrad(op, *grad):
     # At this point, we have created zero_grad guarded by the right switch.
     # Unfortunately, we may still get None here for not trainable data types.
     if zero_grad is None:
+      # For resource variables we get None always on the other branch, so bypass
+      # this.
+      if op.inputs[0].dtype == dtypes.resource:
+        return merge(
+            [grad[op_ctxt.branch]] * 2, name="cond_resource_grad")[0], None
       return None, None
     return merge(grad, name="cond_grad")[0], None
   else:
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 710287012eadea08d6c5be51a8e1be6cce6a5f65..fb53d9ffea174ae4af6f664efb7e36f63e409124 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1598,6 +1598,16 @@ class ControlFlowContext(object):
     last_context = self._context_stack.pop()
     graph._set_control_flow_context(last_context)
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
@@ -3184,12 +3194,18 @@ def while_loop(cond,
         body = lambda i, lv: (i + 1, orig_body(*lv))
 
     if context.executing_eagerly():
+      try_to_pack = len(loop_vars) == 1
+      packed = False  # whether the body result was packed into a 1-item tuple
+
       while cond(*loop_vars):
         loop_vars = body(*loop_vars)
+        if try_to_pack and not isinstance(loop_vars, (list, _basetuple)):
+          packed = True
+          loop_vars = (loop_vars,)
       if maximum_iterations is not None:
         return loop_vars[1]
       else:
-        return loop_vars
+        return loop_vars[0] if packed else loop_vars
 
     if shape_invariants is not None:
       if maximum_iterations is not None:
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index f22f3059d139d1bb7c7db57a2939184f1089f397..289df6f3016e9df6a42d694ae854b4f22fdf84f9 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -947,5 +947,28 @@ class CaseTest(test_util.TensorFlowTestCase):
         sess.run(output, feed_dict={x: 4})
 
 
+@test_util.with_c_api
+class WhileLoopTestCase(test_util.TensorFlowTestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testWhileLoopWithSingleVariable(self):
+    i = constant_op.constant(0)
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: math_ops.add(i, 1)
+    r = control_flow_ops.while_loop(c, b, [i])
+
+    self.assertEqual(self.evaluate(r), 10)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerWhileLoopWithSingleVariable_bodyReturnsTuple(self):
+    i = constant_op.constant(0)
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: (math_ops.add(i, 1),)
+    r = control_flow_ops.while_loop(c, b, [i])
+
+    # Expect a tuple since that is what the body returns.
+    self.assertEqual(self.evaluate(r), (10,))
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index ed435557fde7a2e8a0a4f7eef4e240daef0565e7..4ebc600d034603a80d4fae93b1339e1a1feea038 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -23,7 +23,6 @@ import collections
 import contextlib
 import re
 
-import numpy as np
 import six
 
 from tensorflow.python.framework import dtypes
@@ -31,8 +30,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -41,23 +40,24 @@ __all__ = [
 
 
 class _Mapping(collections.namedtuple(
-    "_Mapping", ["x", "y", "ildj", "kwargs"])):
+    "_Mapping", ["x", "y", "ildj_map", "kwargs"])):
   """Helper class to make it easier to manage caching in `Bijector`."""
 
-  def __new__(cls, x=None, y=None, ildj=None, kwargs=None):
+  def __new__(cls, x=None, y=None, ildj_map=None, kwargs=None):
     """Custom __new__ so namedtuple items have defaults.
 
     Args:
       x: `Tensor`. Forward.
       y: `Tensor`. Inverse.
-      ildj: `Tensor`. Inverse log det Jacobian.
+      ildj_map: `Dictionary`. This is a mapping from event_ndims to a `Tensor`
+        representing the inverse log det jacobian.
       kwargs: Python dictionary. Extra args supplied to
         forward/inverse/etc functions.
 
     Returns:
       mapping: New instance of _Mapping.
     """
-    return super(_Mapping, cls).__new__(cls, x, y, ildj, kwargs)
+    return super(_Mapping, cls).__new__(cls, x, y, ildj_map, kwargs)
 
   @property
   def x_key(self):
@@ -69,13 +69,14 @@ class _Mapping(collections.namedtuple(
     """Returns key used for caching X=g^{-1}(Y)."""
     return (self.y,) + self._deep_tuple(tuple(sorted(self.kwargs.items())))
 
-  def merge(self, x=None, y=None, ildj=None, kwargs=None, mapping=None):
+  def merge(self, x=None, y=None, ildj_map=None, kwargs=None, mapping=None):
     """Returns new _Mapping with args merged with self.
 
     Args:
       x: `Tensor`. Forward.
       y: `Tensor`. Inverse.
-      ildj: `Tensor`. Inverse log det Jacobian.
+      ildj_map: `Dictionary`. This is a mapping from event_ndims to a `Tensor`
+        representing the inverse log det jacobian.
       kwargs: Python dictionary. Extra args supplied to
         forward/inverse/etc functions.
       mapping: Instance of _Mapping to merge. Can only be specified if no other
@@ -88,15 +89,30 @@ class _Mapping(collections.namedtuple(
       ValueError: if mapping and any other arg is not `None`.
     """
     if mapping is None:
-      mapping = _Mapping(x=x, y=y, ildj=ildj, kwargs=kwargs)
-    elif not all(arg is None for arg in [x, y, ildj, kwargs]):
-      raise ValueError("Cannot specify mapping and individual args.")
+      mapping = _Mapping(x=x, y=y, ildj_map=ildj_map, kwargs=kwargs)
+    elif any(arg is not None for arg in [x, y, ildj_map, kwargs]):
+      raise ValueError("Cannot simultaneously specify mapping and individual "
+                       "arguments.")
+
     return _Mapping(
         x=self._merge(self.x, mapping.x),
         y=self._merge(self.y, mapping.y),
-        ildj=self._merge(self.ildj, mapping.ildj),
+        ildj_map=self._merge_dicts(self.ildj_map, mapping.ildj_map),
         kwargs=self._merge(self.kwargs, mapping.kwargs))
 
+  def _merge_dicts(self, old=None, new=None):
+    """Helper to merge two dictionaries."""
+    old = dict() if old is None else old
+    new = dict() if new is None else new
+    for k, v in six.iteritems(new):
+      val = old.get(k, None)
+      if val is not None and val != v:
+        raise ValueError("Found different value for existing key "
+                         "(key:{} old_value:{} new_value:{}".format(
+                             k, old[k], v))
+      old[k] = v
+    return old
+
   def _merge(self, old, new):
     """Helper to merge which handles merging one value."""
     if old is None:
@@ -112,7 +128,6 @@ class _Mapping(collections.namedtuple(
 
 
 @six.add_metaclass(abc.ABCMeta)
-@tf_export("distributions.bijectors.Bijector")
 class Bijector(object):
   r"""Interface for transformations of a `Distribution` sample.
 
@@ -137,11 +152,11 @@ class Bijector(object):
   2. Inverse\
      Useful for "reversing" a transformation to compute one probability in
      terms of another.
-  3. `(log o det o Jacobian o inverse)(x)`\
+  3. `log_det_jacobian(x)`\
      "The log of the determinant of the matrix of all first-order partial
      derivatives of the inverse function."\
      Useful for inverting a transformation to compute one probability in terms
-     of another. Geometrically, the det(Jacobian) is the volume of the
+     of another. Geometrically, the Jacobian determinant is the volume of the
      transformation and is used to scale the probability.
 
   By convention, transformations of random variables are named in terms of the
@@ -164,7 +179,7 @@ class Bijector(object):
 
   ```python
   def transformed_log_prob(bijector, log_prob, x):
-    return (bijector.inverse_log_det_jacobian(x) +
+    return (bijector.inverse_log_det_jacobian(x, event_ndims=0) +
             log_prob(bijector.inverse(x)))
   ```
 
@@ -199,9 +214,11 @@ class Bijector(object):
     ```python
       class Exp(Bijector):
 
-        def __init__(self, event_ndims=0, validate_args=False, name="exp"):
+        def __init__(self, validate_args=False, name="exp"):
           super(Exp, self).__init__(
-              event_ndims=event_ndims, validate_args=validate_args, name=name)
+              validate_args=validate_args,
+              forward_min_event_ndims=0,
+              name=name)
 
         def _forward(self, x):
           return math_ops.exp(x)
@@ -213,10 +230,11 @@ class Bijector(object):
           return -self._forward_log_det_jacobian(self._inverse(y))
 
         def _forward_log_det_jacobian(self, x):
-          if self.event_ndims is None:
-            raise ValueError("Jacobian requires known event_ndims.")
-          event_dims = array_ops.shape(x)[-self.event_ndims:]
-          return math_ops.reduce_sum(x, axis=event_dims)
+          # Notice that we needn't do any reducing, even when`event_ndims > 0`.
+          # The base Bijector class will handle reducing for us; it knows how
+          # to do so because we called `super` `__init__` with
+          # `forward_min_event_ndims = 0`.
+          return x
       ```
 
   - "Affine"
@@ -237,18 +255,50 @@ class Bijector(object):
                   MultivariateNormal(inv(sqrtSigma) * (y - mu); 0, I_d)
       ```
 
-  #### Jacobian
+  #### Min_event_ndims and Naming
+
+  Bijectors are named for the dimensionality of data they act on (i.e. without
+  broadcasting). We can think of bijectors having an intrinsic `min_event_ndims`
+  , which is the minimum number of dimensions for the bijector act on. For
+  instance, a Cholesky decomposition requires a matrix, and hence
+  `min_event_ndims=2`.
+
+  Some examples:
+
+  `AffineScalar:  min_event_ndims=0`
+  `Affine:  min_event_ndims=1`
+  `Cholesky:  min_event_ndims=2`
+  `Exp:  min_event_ndims=0`
+  `Sigmoid:  min_event_ndims=0`
+  `SoftmaxCentered:  min_event_ndims=1`
+
+  Note the difference between `Affine` and `AffineScalar`. `AffineScalar`
+  operates on scalar events, whereas `Affine` operates on vector-valued events.
 
-  The Jacobian is a reduction over event dims. To see this, consider the `Exp`
-  `Bijector` applied to a `Tensor` which has sample, batch, and event (S, B, E)
-  shape semantics. Suppose the `Tensor`'s partitioned-shape is `(S=[4], B=[2],
-  E=[3, 3])`. The shape of the `Tensor` returned by `forward` and `inverse` is
-  unchanged, i.e., `[4, 2, 3, 3]`.  However the shape returned by
-  `inverse_log_det_jacobian` is `[4, 2]` because the Jacobian is a reduction
-  over the event dimensions.
+  More generally, there is a `forward_min_event_ndims` and an
+  `inverse_min_event_ndims`. In most cases, these will be the same.
+  However, for some shape changing bijectors, these will be different
+  (e.g. a bijector which pads an extra dimension at the end, might have
+  `forward_min_event_ndims=0` and `inverse_min_event_ndims=1`.
 
-  It is sometimes useful to implement the inverse Jacobian as the negative
-  forward Jacobian. For example,
+
+  #### Jacobian Determinant
+
+  The Jacobian determinant is a reduction over `event_ndims - min_event_ndims`
+  (`forward_min_event_ndims` for `forward_log_det_jacobian` and
+  `inverse_min_event_ndims` for `inverse_log_det_jacobian`).
+  To see this, consider the `Exp` `Bijector` applied to a `Tensor` which has
+  sample, batch, and event (S, B, E) shape semantics. Suppose the `Tensor`'s
+  partitioned-shape is `(S=[4], B=[2], E=[3, 3])`. The shape of the `Tensor`
+  returned by `forward` and `inverse` is unchanged, i.e., `[4, 2, 3, 3]`.
+  However the shape returned by `inverse_log_det_jacobian` is `[4, 2]` because
+  the Jacobian determinant is a reduction over the event dimensions.
+
+  Another example is the `Affine` `Bijector`. Because `min_event_ndims = 1`, the
+  Jacobian determinant reduction is over `event_ndims - 1`.
+
+  It is sometimes useful to implement the inverse Jacobian determinant as the
+  negative forward Jacobian determinant. For example,
 
   ```python
   def _inverse_log_det_jacobian(self, y):
@@ -279,9 +329,54 @@ class Bijector(object):
       The claim follows from [properties of determinant](
   https://en.wikipedia.org/wiki/Determinant#Multiplicativity_and_matrix_groups).
 
-  Generally its preferable to directly implement the inverse Jacobian. This
-  should have superior numerical stability and will often share subgraphs with
-  the `_inverse` implementation.
+  Generally its preferable to directly implement the inverse Jacobian
+  determinant.  This should have superior numerical stability and will often
+  share subgraphs with the `_inverse` implementation.
+
+  #### Is_constant_jacobian
+
+  Certain bijectors will have constant jacobian matrices. For instance, the
+  `Affine` bijector encodes multiplication by a matrix plus a shift, with
+  jacobian matrix, the same aforementioned matrix.
+
+  `is_constant_jacobian` encodes the fact that the jacobian matrix is constant.
+  The semantics of this argument are the following:
+
+    * Repeated calls to "log_det_jacobian" functions with the same
+      `event_ndims` (but not necessarily same input), will return the first
+      computed jacobian (because the matrix is constant, and hence is input
+      independent).
+    * `log_det_jacobian` implementations are merely broadcastable to the true
+      `log_det_jacobian` (because, again, the jacobian matrix is input
+      independent). Specifically, `log_det_jacobian` is implemented as the
+      log jacobian determinant for a single input.
+
+      ```python
+      class Identity(Bijector):
+
+        def __init__(self, validate_args=False, name="identity"):
+          super(Identity, self).__init__(
+              is_constant_jacobian=True,
+              validate_args=validate_args,
+              forward_min_event_ndims=0,
+              name=name)
+
+        def _forward(self, x):
+          return x
+
+        def _inverse(self, y):
+          return y
+
+        def _inverse_log_det_jacobian(self, y):
+          return -self._forward_log_det_jacobian(self._inverse(y))
+
+        def _forward_log_det_jacobian(self, x):
+          # The full log jacobian determinant would be array_ops.zero_like(x).
+          # However, we circumvent materializing that, since the jacobian
+          # calculation is input independent, and we specify it for one input.
+          return constant_op.constant(0., x.dtype.base_dtype)
+
+      ```
 
   #### Subclass Requirements
 
@@ -364,14 +459,14 @@ class Bijector(object):
   ==> (-1., 1.)
 
   # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
-  abs.inverse_log_det_jacobian(1.)
+  abs.inverse_log_det_jacobian(1., event_ndims=0)
   ==> (0., 0.)
 
   # Special case handling of 0.
   abs.inverse(0.)
   ==> (0., 0.)
 
-  abs.inverse_log_det_jacobian(0.)
+  abs.inverse_log_det_jacobian(0., event_ndims=0)
   ==> (0., 0.)
   ```
 
@@ -379,11 +474,12 @@ class Bijector(object):
 
   @abc.abstractmethod
   def __init__(self,
-               event_ndims=None,
                graph_parents=None,
                is_constant_jacobian=False,
                validate_args=False,
                dtype=None,
+               forward_min_event_ndims=None,
+               inverse_min_event_ndims=None,
                name=None):
     """Constructs Bijector.
 
@@ -392,42 +488,61 @@ class Bijector(object):
     Examples:
 
     ```python
-    # Create the Y = g(X) = X transform which operates on vector events.
-    identity = Identity(event_ndims=1)
+    # Create the Y = g(X) = X transform.
+    identity = Identity()
 
-    # Create the Y = g(X) = exp(X) transform which operates on matrices.
-    exp = Exp(event_ndims=2)
+    # Create the Y = g(X) = exp(X) transform.
+    exp = Exp()
     ```
 
     See `Bijector` subclass docstring for more details and specific examples.
 
     Args:
-      event_ndims: number of dimensions associated with event coordinates.
       graph_parents: Python list of graph prerequisites of this `Bijector`.
-      is_constant_jacobian: Python `bool` indicating that the Jacobian is not a
-        function of the input.
+      is_constant_jacobian: Python `bool` indicating that the Jacobian matrix is
+        not a function of the input.
       validate_args: Python `bool`, default `False`. Whether to validate input
         with asserts. If `validate_args` is `False`, and the inputs are invalid,
         correct behavior is not guaranteed.
       dtype: `tf.dtype` supported by this `Bijector`. `None` means dtype is not
         enforced.
+      forward_min_event_ndims: Python `integer` indicating the minimum number of
+        dimensions `forward` operates on.
+      inverse_min_event_ndims: Python `integer` indicating the minimum number of
+        dimensions `inverse` operates on. Will be set to
+        `forward_min_event_ndims` by default, if no value is provided.
       name: The name to give Ops created by the initializer.
 
     Raises:
+      ValueError:  If neither `forward_min_event_ndims` and
+        `inverse_min_event_ndims` are specified, or if either of them is
+        negative.
       ValueError:  If a member of `graph_parents` is not a `Tensor`.
     """
-    self._event_ndims = (
-        ops.convert_to_tensor(event_ndims, dtype=dtypes.int32)
-        if event_ndims is not None else None)
     self._graph_parents = graph_parents or []
+
+    if forward_min_event_ndims is None and inverse_min_event_ndims is None:
+      raise ValueError("Must specify at least one of `forward_min_event_ndims` "
+                       "and `inverse_min_event_ndims`.")
+    elif inverse_min_event_ndims is None:
+      inverse_min_event_ndims = forward_min_event_ndims
+    elif forward_min_event_ndims is None:
+      forward_min_event_ndims = inverse_min_event_ndims
+
+    if forward_min_event_ndims < 0:
+      raise ValueError("forward_min_event_ndims must be a non-negative "
+                       "integer.")
+    if inverse_min_event_ndims < 0:
+      raise ValueError("inverse_min_event_ndims must be a non-negative "
+                       "integer.")
+    self._forward_min_event_ndims = forward_min_event_ndims
+    self._inverse_min_event_ndims = inverse_min_event_ndims
     self._is_constant_jacobian = is_constant_jacobian
+    self._constant_ildj_map = {}
     self._validate_args = validate_args
     self._dtype = dtype
     self._from_y = {}
     self._from_x = {}
-    # Using abbreviation ildj for "inverse log det Jacobian."
-    # This variable is not `None` iff is_constant_jacobian is `True`.
-    self._constant_ildj = None
     if name:
       self._name = name
     else:
@@ -442,21 +557,27 @@ class Bijector(object):
       if t is None or not tensor_util.is_tensor(t):
         raise ValueError("Graph parent item %d is not a Tensor; %s." % (i, t))
 
-  @property
-  def event_ndims(self):
-    """Returns then number of event dimensions this bijector operates on."""
-    return self._event_ndims
-
   @property
   def graph_parents(self):
     """Returns this `Bijector`'s graph_parents as a Python list."""
     return self._graph_parents
 
+  @property
+  def forward_min_event_ndims(self):
+    """Returns the minimal number of dimensions bijector.forward operates on."""
+    return self._forward_min_event_ndims
+
+  @property
+  def inverse_min_event_ndims(self):
+    """Returns the minimal number of dimensions bijector.inverse operates on."""
+    return self._inverse_min_event_ndims
+
   @property
   def is_constant_jacobian(self):
-    """Returns true iff the Jacobian is not a function of x.
+    """Returns true iff the Jacobian matrix is not a function of x.
 
-    Note: Jacobian is either constant for both forward and inverse or neither.
+    Note: Jacobian matrix is either constant for both forward and inverse or
+    neither.
 
     Returns:
       is_constant_jacobian: Python `bool`.
@@ -653,36 +774,57 @@ class Bijector(object):
     return self._call_inverse(y, name)
 
   def _inverse_log_det_jacobian(self, y):
-    """Subclass implementation of `inverse_log_det_jacobian` public function."""
+    """Subclass implementation of `inverse_log_det_jacobian` public function.
+
+    In particular, this method differs from the public function, in that it
+    does not take `event_ndims`. Thus, this implements the minimal Jacobian
+    determinant calculation (i.e. over `inverse_min_event_ndims`).
+
+    Args:
+      y: `Tensor`. The input to the "inverse_log_det_jacobian" evaluation.
+    Returns:
+      inverse_log_det_jacobian: `Tensor`, if this bijector is injective.
+        If not injective, returns the k-tuple containing jacobians for the
+        unique `k` points `(x1, ..., xk)` such that `g(xi) = y`.
+    """
     raise NotImplementedError("inverse_log_det_jacobian not implemented.")
 
-  def _call_inverse_log_det_jacobian(self, y, name, **kwargs):
+  def _call_inverse_log_det_jacobian(self, y, event_ndims, name, **kwargs):
     with self._name_scope(name, [y]):
-      if self._constant_ildj is not None:
-        return self._constant_ildj
+      if event_ndims in self._constant_ildj_map:
+        return self._constant_ildj_map[event_ndims]
       y = ops.convert_to_tensor(y, name="y")
       self._maybe_assert_dtype(y)
       if not self._is_injective:  # No caching for non-injective
-        return self._inverse_log_det_jacobian(y, **kwargs)
+        ildjs = self._inverse_log_det_jacobian(y, **kwargs)
+        return tuple(self._reduce_jacobian_det_over_event(
+            y, ildj, self.inverse_min_event_ndims, event_ndims)
+                     for ildj in ildjs)
       mapping = self._lookup(y=y, kwargs=kwargs)
-      if mapping.ildj is not None:
-        return mapping.ildj
+      if mapping.ildj_map is not None and event_ndims in mapping.ildj_map:
+        return mapping.ildj_map[event_ndims]
       try:
         x = None  # Not needed; leave cache as is.
         ildj = self._inverse_log_det_jacobian(y, **kwargs)
+        ildj = self._reduce_jacobian_det_over_event(
+            y, ildj, self.inverse_min_event_ndims, event_ndims)
       except NotImplementedError as original_exception:
         try:
           x = mapping.x if mapping.x is not None else self._inverse(y, **kwargs)
           ildj = -self._forward_log_det_jacobian(x, **kwargs)
+          ildj = self._reduce_jacobian_det_over_event(
+              x, ildj, self.forward_min_event_ndims, event_ndims)
         except NotImplementedError:
           raise original_exception
-      mapping = mapping.merge(x=x, ildj=ildj)
+
+      mapping = mapping.merge(x=x, ildj_map={event_ndims: ildj})
       self._cache(mapping)
       if self.is_constant_jacobian:
-        self._constant_ildj = mapping.ildj
-      return mapping.ildj
+        self._constant_ildj_map[event_ndims] = ildj
+      return ildj
 
-  def inverse_log_det_jacobian(self, y, name="inverse_log_det_jacobian"):
+  def inverse_log_det_jacobian(
+      self, y, event_ndims, name="inverse_log_det_jacobian"):
     """Returns the (log o det o Jacobian o inverse)(y).
 
     Mathematically, returns: `log(det(dX/dY))(Y)`. (Recall that: `X=g^{-1}(Y)`.)
@@ -691,7 +833,12 @@ class Bijector(object):
     evaluated at `g^{-1}(y)`.
 
     Args:
-      y: `Tensor`. The input to the "inverse" Jacobian evaluation.
+      y: `Tensor`. The input to the "inverse" Jacobian determinant evaluation.
+      event_ndims: Number of dimensions in the probabilistic events being
+        transformed. Must be greater than or equal to
+        `self.inverse_min_event_ndims`. The result is summed over the final
+        dimensions to produce a scalar Jacobian determinant for each event,
+        i.e. it has shape `y.shape.ndims - event_ndims` dimensions.
       name: The name to give this op.
 
     Returns:
@@ -705,45 +852,74 @@ class Bijector(object):
         `self.dtype`.
       NotImplementedError: if `_inverse_log_det_jacobian` is not implemented.
     """
-    return self._call_inverse_log_det_jacobian(y, name)
+    with ops.control_dependencies(self._check_valid_event_ndims(
+        min_event_ndims=self.inverse_min_event_ndims, event_ndims=event_ndims)):
+      return self._call_inverse_log_det_jacobian(y, event_ndims, name)
 
   def _forward_log_det_jacobian(self, x):
-    """Subclass implementation of `forward_log_det_jacobian`."""
+    """Subclass implementation of `forward_log_det_jacobian` public function.
+
+    In particular, this method differs from the public function, in that it
+    does not take `event_ndims`. Thus, this implements the minimal Jacobian
+    determinant calculation (i.e. over `forward_min_event_ndims`).
+
+    Args:
+      x: `Tensor`. The input to the "forward_log_det_jacobian" evaluation.
+
+    Returns:
+      forward_log_det_jacobian: `Tensor`, if this bijector is injective.
+        If not injective, returns the k-tuple containing jacobians for the
+        unique `k` points `(x1, ..., xk)` such that `g(xi) = y`.
+    """
+
     raise NotImplementedError(
         "forward_log_det_jacobian not implemented.")
 
-  def _call_forward_log_det_jacobian(self, x, name, **kwargs):
+  def _call_forward_log_det_jacobian(self, x, event_ndims, name, **kwargs):
     with self._name_scope(name, [x]):
-      if self._constant_ildj is not None:
+      if event_ndims in self._constant_ildj_map:
         # Need "-1. *" to avoid invalid-unary-operand-type linter warning.
-        return -1. * self._constant_ildj
+        return -1. * self._constant_ildj_map[event_ndims]
       x = ops.convert_to_tensor(x, name="x")
       self._maybe_assert_dtype(x)
       if not self._is_injective:
-        return self._forward_log_det_jacobian(x, **kwargs)  # No caching.
+        fldjs = self._forward_log_det_jacobian(x, **kwargs)  # No caching.
+        return tuple(self._reduce_jacobian_det_over_event(
+            x, fldj, self.forward_min_event_ndims, event_ndims)
+                     for fldj in fldjs)
       mapping = self._lookup(x=x, kwargs=kwargs)
-      if mapping.ildj is not None:
-        return -mapping.ildj
+      if mapping.ildj_map is not None and event_ndims in mapping.ildj_map:
+        return -mapping.ildj_map[event_ndims]
       try:
         y = None  # Not needed; leave cache as is.
         ildj = -self._forward_log_det_jacobian(x, **kwargs)
+        ildj = self._reduce_jacobian_det_over_event(
+            x, ildj, self.forward_min_event_ndims, event_ndims)
       except NotImplementedError as original_exception:
         try:
           y = mapping.y if mapping.y is not None else self._forward(x, **kwargs)
           ildj = self._inverse_log_det_jacobian(y, **kwargs)
+          ildj = self._reduce_jacobian_det_over_event(
+              y, ildj, self.inverse_min_event_ndims, event_ndims)
         except NotImplementedError:
           raise original_exception
-      mapping = mapping.merge(y=y, ildj=ildj)
+      mapping = mapping.merge(y=y, ildj_map={event_ndims: ildj})
       self._cache(mapping)
       if self.is_constant_jacobian:
-        self._constant_ildj = mapping.ildj
-      return -mapping.ildj
+        self._constant_ildj_map[event_ndims] = ildj
+      return -ildj
 
-  def forward_log_det_jacobian(self, x, name="forward_log_det_jacobian"):
+  def forward_log_det_jacobian(
+      self, x, event_ndims, name="forward_log_det_jacobian"):
     """Returns both the forward_log_det_jacobian.
 
     Args:
-      x: `Tensor`. The input to the "forward" Jacobian evaluation.
+      x: `Tensor`. The input to the "forward" Jacobian determinant evaluation.
+      event_ndims: Number of dimensions in the probabilistic events being
+        transformed. Must be greater than or equal to
+        `self.forward_min_event_ndims`. The result is summed over the final
+        dimensions to produce a scalar Jacobian determinant for each event,
+        i.e. it has shape `x.shape.ndims - event_ndims` dimensions.
       name: The name to give this op.
 
     Returns:
@@ -761,7 +937,9 @@ class Bijector(object):
       raise NotImplementedError(
           "forward_log_det_jacobian cannot be implemented for non-injective "
           "transforms.")
-    return self._call_forward_log_det_jacobian(x, name)
+    with ops.control_dependencies(self._check_valid_event_ndims(
+        min_event_ndims=self.forward_min_event_ndims, event_ndims=event_ndims)):
+      return self._call_forward_log_det_jacobian(x, event_ndims, name)
 
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
@@ -779,9 +957,6 @@ class Bijector(object):
 
   def _cache(self, mapping):
     """Helper which stores mapping info in forward/inverse dicts."""
-    if self._constant_ildj is not None:
-      # Fold in ildj if known constant Jacobian.
-      mapping = mapping.merge(ildj=self._constant_ildj)
     # Merging from lookup is an added check that we're not overwriting anything
     # which is not None.
     mapping = mapping.merge(mapping=self._lookup(
@@ -803,22 +978,66 @@ class Bijector(object):
       return self._from_y.get(mapping.y_key, mapping)
     return mapping
 
-  def _event_dims_tensor(self, sample):
-    """Return a 1D `int32` tensor: `range(rank(sample))[-event_ndims:]`."""
-    if self.event_ndims is None:
-      raise ValueError("Jacobian cannot be computed with unknown event_ndims")
-    static_event_ndims = tensor_util.constant_value(self.event_ndims)
-    static_rank = sample.get_shape().ndims
-    if static_event_ndims is not None and static_rank is not None:
-      return ops.convert_to_tensor(
-          static_rank + np.arange(-static_event_ndims, 0).astype(np.int32))
-
-    if static_event_ndims is not None:
-      event_range = np.arange(-static_event_ndims, 0).astype(np.int32)
-    else:
-      event_range = math_ops.range(-self.event_ndims, 0, dtype=dtypes.int32)
-
-    if static_rank is not None:
-      return event_range + static_rank
+  def _reduce_jacobian_det_over_event(
+      self, y, ildj, min_event_ndims, event_ndims):
+    """Reduce jacobian over event_ndims - min_event_ndims."""
+    if not self.is_constant_jacobian:
+      return math_ops.reduce_sum(
+          ildj,
+          self._get_event_reduce_dims(min_event_ndims, event_ndims))
+
+    # In this case, we need to tile the jacobian over the event and reduce.
+    y_rank = array_ops.rank(y)
+    y_shape = array_ops.shape(y)[
+        y_rank - event_ndims : y_rank - min_event_ndims]
+
+    ones = array_ops.ones(y_shape, ildj.dtype)
+    reduced_ildj = math_ops.reduce_sum(
+        ones * ildj,
+        axis=self._get_event_reduce_dims(min_event_ndims, event_ndims))
+    # The multiplication by ones can change the inferred static shape so we try
+    # to recover as much as possible.
+    if (isinstance(event_ndims, int) and
+        y.get_shape().ndims and ildj.get_shape().ndims):
+      y_shape = y.get_shape()
+      y_shape = y_shape[y_shape.ndims - event_ndims :
+                        y_shape.ndims - min_event_ndims]
+      ildj_shape = ildj.get_shape()
+      broadcast_shape = array_ops.broadcast_static_shape(
+          ildj_shape, y_shape)
+      reduced_ildj.set_shape(
+          broadcast_shape[: broadcast_shape.ndims - (
+              event_ndims - min_event_ndims)])
+
+    return reduced_ildj
+
+  def _get_event_reduce_dims(self, min_event_ndims, event_ndims):
+    """Compute the reduction dimensions given event_ndims."""
+    min_event_ndims_ = (min_event_ndims if isinstance(min_event_ndims, int)
+                        else tensor_util.constant_value(min_event_ndims))
+    event_ndims_ = (event_ndims if isinstance(event_ndims, int)
+                    else tensor_util.constant_value(event_ndims))
+
+    if min_event_ndims_ is not None and event_ndims_ is not None:
+      return [-index for index in range(1, event_ndims_ - min_event_ndims_ + 1)]
     else:
-      return event_range + array_ops.rank(sample)
+      reduce_ndims = event_ndims - min_event_ndims
+      return math_ops.range(-reduce_ndims, 0)
+
+  def _check_valid_event_ndims(self, min_event_ndims, event_ndims):
+    """Check whether event_ndims is atleast min_event_ndims."""
+    min_event_ndims_ = (min_event_ndims if isinstance(min_event_ndims, int)
+                        else tensor_util.constant_value(min_event_ndims))
+    event_ndims_ = (event_ndims if isinstance(event_ndims, int)
+                    else tensor_util.constant_value(event_ndims))
+
+    if min_event_ndims_ is not None and event_ndims_ is not None:
+      if min_event_ndims_ > event_ndims_:
+        raise ValueError("event_ndims ({}) must be larger than "
+                         "min_event_ndims ({})".format(
+                             event_ndims_, min_event_ndims_))
+      return []
+
+    if self.validate_args:
+      return [check_ops.assert_greater_equal(event_ndims, min_event_ndims)]
+    return []
diff --git a/tensorflow/python/ops/distributions/bijector_test_util.py b/tensorflow/python/ops/distributions/bijector_test_util.py
index ff3535c62642d98bdd9b18808f45deae27d6d88d..784bfd58352f4035cd1bd4caa91eba6e6dc8d30d 100644
--- a/tensorflow/python/ops/distributions/bijector_test_util.py
+++ b/tensorflow/python/ops/distributions/bijector_test_util.py
@@ -79,9 +79,7 @@ def assert_scalar_congruency(bijector,
   Raises:
     AssertionError:  If tests fail.
   """
-
   # Checks and defaults.
-  assert bijector.event_ndims.eval() == 0
   if sess is None:
     sess = ops.get_default_session()
 
@@ -111,7 +109,10 @@ def assert_scalar_congruency(bijector,
   # (b - a) = \int_a^b dx = \int_{y(a)}^{y(b)} |dx/dy| dy
   # "change_measure_dy_dx" below is a Monte Carlo approximation to the right
   # hand side, which should then be close to the left, which is (b - a).
-  dy_dx = math_ops.exp(bijector.inverse_log_det_jacobian(uniform_y_samps))
+  # We assume event_ndims=0 because we assume scalar -> scalar. The log_det
+  # methods will handle whether they expect event_ndims > 0.
+  dy_dx = math_ops.exp(bijector.inverse_log_det_jacobian(
+      uniform_y_samps, event_ndims=0))
   # E[|dx/dy|] under Uniform[lower_y, upper_y]
   # = \int_{y(a)}^{y(b)} |dx/dy| dP(u), where dP(u) is the uniform measure
   expectation_of_dy_dx_under_uniform = math_ops.reduce_mean(dy_dx)
@@ -121,7 +122,8 @@ def assert_scalar_congruency(bijector,
 
   # We'll also check that dy_dx = 1 / dx_dy.
   dx_dy = math_ops.exp(
-      bijector.forward_log_det_jacobian(bijector.inverse(uniform_y_samps)))
+      bijector.forward_log_det_jacobian(
+          bijector.inverse(uniform_y_samps), event_ndims=0))
 
   [
       forward_on_10_pts_v,
@@ -158,7 +160,8 @@ def assert_scalar_congruency(bijector,
       dy_dx_v, np.divide(1., dx_dy_v), atol=1e-5, rtol=1e-3)
 
 
-def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
+def assert_bijective_and_finite(
+    bijector, x, y, event_ndims, atol=0, rtol=1e-5, sess=None):
   """Assert that forward/inverse (along with jacobians) are inverses and finite.
 
   It is recommended to use x and y values that are very very close to the edge
@@ -168,6 +171,8 @@ def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
     bijector:  A Bijector instance.
     x:  np.array of values in the domain of bijector.forward.
     y:  np.array of values in the domain of bijector.inverse.
+    event_ndims: Integer describing the number of event dimensions this bijector
+      operates on.
     atol:  Absolute tolerance.
     rtol:  Relative tolerance.
     sess:  TensorFlow session.  Defaults to the default session.
@@ -197,10 +202,10 @@ def assert_bijective_and_finite(bijector, x, y, atol=0, rtol=1e-5, sess=None):
   ] = sess.run([
       bijector.inverse(f_x),
       bijector.forward(g_y),
-      bijector.inverse_log_det_jacobian(f_x),
-      bijector.forward_log_det_jacobian(x),
-      bijector.inverse_log_det_jacobian(y),
-      bijector.forward_log_det_jacobian(g_y),
+      bijector.inverse_log_det_jacobian(f_x, event_ndims=event_ndims),
+      bijector.forward_log_det_jacobian(x, event_ndims=event_ndims),
+      bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims),
+      bijector.forward_log_det_jacobian(g_y, event_ndims=event_ndims),
       f_x,
       g_y,
   ])
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 9161e3fa9f5f7f844e7f4926992c954acae246d6..995dd9ca2ad39ef36b473050f896315a9372929a 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -311,7 +311,7 @@ class Categorical(distribution.Distribution):
         nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
 
   def _mode(self):
-    ret = math_ops.argmax(self.logits, dimension=self._batch_rank)
+    ret = math_ops.argmax(self.logits, axis=self._batch_rank)
     ret = math_ops.cast(ret, self.dtype)
     ret.set_shape(self.batch_shape)
     return ret
diff --git a/tensorflow/python/ops/distributions/distributions.py b/tensorflow/python/ops/distributions/distributions.py
index 9df7d148a583e533475276e090bcb02cb705290f..7c4b8697d818a88000fc2e84aec461d2918386a6 100644
--- a/tensorflow/python/ops/distributions/distributions.py
+++ b/tensorflow/python/ops/distributions/distributions.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.python.ops.distributions import bijectors
 from tensorflow.python.ops.distributions.bernoulli import Bernoulli
 from tensorflow.python.ops.distributions.beta import Beta
 from tensorflow.python.ops.distributions.categorical import Categorical
@@ -40,7 +39,6 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
-    "bijectors",
     "Bernoulli",
     "Beta",
     "Categorical",
diff --git a/tensorflow/python/ops/distributions/identity_bijector.py b/tensorflow/python/ops/distributions/identity_bijector.py
index 2972c3554b3639a1ae30a4167f73613b1ff8add2..8628e68f967337fb81187bae9576a168e1cd5a36 100644
--- a/tensorflow/python/ops/distributions/identity_bijector.py
+++ b/tensorflow/python/ops/distributions/identity_bijector.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -28,7 +27,6 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.bijectors.Identity")
 class Identity(bijector.Bijector):
   """Compute Y = g(X) = X.
 
@@ -37,7 +35,7 @@ class Identity(bijector.Bijector):
     ```python
     # Create the Y=g(X)=X transform which is intended for Tensors with 1 batch
     # ndim and 1 event ndim (i.e., vector of vectors).
-    identity = Identity(event_ndims=1)
+    identity = Identity()
     x = [[1., 2],
          [3, 4]]
     x == identity.forward(x) == identity.inverse(x)
@@ -45,10 +43,10 @@ class Identity(bijector.Bijector):
 
   """
 
-  def __init__(self, validate_args=False, event_ndims=0, name="identity"):
+  def __init__(self, validate_args=False, name="identity"):
     super(Identity, self).__init__(
+        forward_min_event_ndims=0,
         is_constant_jacobian=True,
-        event_ndims=event_ndims,
         validate_args=validate_args,
         name=name)
 
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1efcf9d32e9ea9924bb080459efb7015e33ccd54..1ad63a8cf6633302375e948c2d3892e9578c05a5 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -197,8 +197,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Affine(
       shift=-1.,
-      scale_identity_multiplier=2.,
-      event_ndims=0),
+      scale_identity_multiplier=2.)
     name="NormalTransformedDistribution")
   ```
 
@@ -419,48 +418,51 @@ class TransformedDistribution(distribution_lib.Distribution):
     # For caching to work, it is imperative that the bijector is the first to
     # modify the input.
     x = self.bijector.inverse(y)
-    ildj = self.bijector.inverse_log_det_jacobian(y)
+    event_ndims = self._maybe_get_event_ndims_statically()
+
+    ildj = self.bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims)
     if self.bijector._is_injective:  # pylint: disable=protected-access
-      return self._finish_log_prob_for_one_fiber(y, x, ildj)
+      return self._finish_log_prob_for_one_fiber(y, x, ildj, event_ndims)
 
     lp_on_fibers = [
-        self._finish_log_prob_for_one_fiber(y, x_i, ildj_i)
+        self._finish_log_prob_for_one_fiber(y, x_i, ildj_i, event_ndims)
         for x_i, ildj_i in zip(x, ildj)]
     return math_ops.reduce_logsumexp(array_ops.stack(lp_on_fibers), axis=0)
 
-  def _finish_log_prob_for_one_fiber(self, y, x, ildj):
+  def _finish_log_prob_for_one_fiber(self, y, x, ildj, event_ndims):
     """Finish computation of log_prob on one element of the inverse image."""
     x = self._maybe_rotate_dims(x, rotate_right=True)
     log_prob = self.distribution.log_prob(x)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
     log_prob += math_ops.cast(ildj, log_prob.dtype)
-    if self._is_maybe_event_override:
+    if self._is_maybe_event_override and isinstance(event_ndims, int):
       log_prob.set_shape(array_ops.broadcast_static_shape(
-          y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
+          x.get_shape().with_rank_at_least(1)[:-event_ndims], self.batch_shape))
     return log_prob
 
   def _prob(self, y):
     x = self.bijector.inverse(y)
-    ildj = self.bijector.inverse_log_det_jacobian(y)
+    event_ndims = self._maybe_get_event_ndims_statically()
+    ildj = self.bijector.inverse_log_det_jacobian(y, event_ndims=event_ndims)
     if self.bijector._is_injective:  # pylint: disable=protected-access
-      return self._finish_prob_for_one_fiber(y, x, ildj)
+      return self._finish_prob_for_one_fiber(y, x, ildj, event_ndims)
 
     prob_on_fibers = [
-        self._finish_prob_for_one_fiber(y, x_i, ildj_i)
+        self._finish_prob_for_one_fiber(y, x_i, ildj_i, event_ndims)
         for x_i, ildj_i in zip(x, ildj)]
     return sum(prob_on_fibers)
 
-  def _finish_prob_for_one_fiber(self, y, x, ildj):
+  def _finish_prob_for_one_fiber(self, y, x, ildj, event_ndims):
     """Finish computation of prob on one element of the inverse image."""
     x = self._maybe_rotate_dims(x, rotate_right=True)
     prob = self.distribution.prob(x)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
     prob *= math_ops.exp(math_ops.cast(ildj, prob.dtype))
-    if self._is_maybe_event_override:
+    if self._is_maybe_event_override and isinstance(event_ndims, int):
       prob.set_shape(array_ops.broadcast_static_shape(
-          y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
+          y.get_shape().with_rank_at_least(1)[:-event_ndims], self.batch_shape))
     return prob
 
   def _log_cdf(self, y):
@@ -545,10 +547,17 @@ class TransformedDistribution(distribution_lib.Distribution):
           _ones_like(self.distribution.batch_shape_tensor())
       ], 0)
       entropy = array_ops.tile(entropy, multiples)
-    dummy = array_ops.zeros([], self.dtype)
-    entropy -= math_ops.cast(
-        self.bijector.inverse_log_det_jacobian(dummy),
-        entropy.dtype)
+    dummy = array_ops.zeros(
+        shape=array_ops.concat(
+            [self.batch_shape_tensor(), self.event_shape_tensor()],
+            0),
+        dtype=self.dtype)
+    event_ndims = (self.event_shape.ndims if self.event_shape.ndims is not None
+                   else array_ops.size(self.event_shape_tensor()))
+    ildj = self.bijector.inverse_log_det_jacobian(
+        dummy, event_ndims=event_ndims)
+
+    entropy -= math_ops.cast(ildj, entropy.dtype)
     entropy.set_shape(self.batch_shape)
     return entropy
 
@@ -610,3 +619,16 @@ class TransformedDistribution(distribution_lib.Distribution):
     n = (ndims - self._rotate_ndims) if rotate_right else self._rotate_ndims
     return array_ops.transpose(
         x, _concat_vectors(math_ops.range(n, ndims), math_ops.range(0, n)))
+
+  def _maybe_get_event_ndims_statically(self):
+    if self.event_shape.ndims is not None:
+      return self.event_shape.ndims
+
+    event_ndims = array_ops.size(self.event_shape_tensor())
+
+    static_event_ndims = tensor_util.constant_value(event_ndims)
+
+    if static_event_ndims is not None:
+      return static_event_ndims
+
+    return event_ndims
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 0fe6aa30f945dc7682a53fa6495823288cf111b7..2e067eab459050e30d220bdb7ff0d65cb9c552f7 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -58,8 +58,7 @@ def assert_close(
   if data is None:
     data = [
         message,
-        "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
-        y.name, y
+        "Condition x ~= y did not hold element-wise: x = ", x, "y = ", y
     ]
 
   if x.dtype.is_integer:
@@ -95,7 +94,7 @@ def assert_integer_form(
     x = ops.convert_to_tensor(x, name="x")
     if x.dtype.is_integer:
       return control_flow_ops.no_op()
-    message = message or "{} has non-integer components".format(x.op.name)
+    message = message or "{} has non-integer components".format(x)
     if int_dtype is None:
       try:
         int_dtype = {
@@ -123,13 +122,13 @@ def embed_check_nonnegative_integer_form(
     x = ops.convert_to_tensor(x, name="x")
     assertions = [
         check_ops.assert_non_negative(
-            x, message="'{}' must be non-negative.".format(x.op.name)),
+            x, message="'{}' must be non-negative.".format(x)),
     ]
     if not x.dtype.is_integer:
       assertions += [
           assert_integer_form(
               x, message="'{}' cannot contain fractional components.".format(
-                  x.op.name)),
+                  x)),
       ]
     return control_flow_ops.with_dependencies(assertions, x)
 
@@ -434,7 +433,7 @@ def embed_check_integer_casting_closed(
         and not _is_integer_like_by_dtype(target_dtype)):
       raise TypeError("At least one of {}.dtype ({}) and target_dtype ({}) "
                       "must be integer-type.".format(
-                          x.op.name, x.dtype.name, target_dtype.name))
+                          x, x.dtype.name, target_dtype.name))
 
     assertions = []
     if assert_nonnegative:
@@ -683,7 +682,7 @@ def pick_vector(cond,
     cond = ops.convert_to_tensor(cond, name="cond")
     if cond.dtype != dtypes.bool:
       raise TypeError("%s.dtype=%s which is not %s" %
-                      (cond.name, cond.dtype, dtypes.bool))
+                      (cond, cond.dtype, dtypes.bool))
     cond_value_static = tensor_util.constant_value(cond)
     if cond_value_static is not None:
       return true_vector if cond_value_static else false_vector
@@ -692,8 +691,8 @@ def pick_vector(cond,
     if true_vector.dtype != false_vector.dtype:
       raise TypeError(
           "%s.dtype=%s does not match %s.dtype=%s"
-          % (true_vector.name, true_vector.dtype,
-             false_vector.name, false_vector.dtype))
+          % (true_vector, true_vector.dtype,
+             false_vector, false_vector.dtype))
     n = array_ops.shape(true_vector)[0]
     return array_ops.slice(
         array_ops.concat([true_vector, false_vector], 0),
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index f0120f2957db12caf6a513fde9aa8c756aff8bad..9e46739bc1b6899b9b1fea898254f6e61fb7b8ba 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -331,11 +331,11 @@ def embedding_lookup_sparse(params,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
       element must be appropriately sized for the given `partition_strategy`.
-    sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
+    sp_ids: N x M `SparseTensor` of int64 ids (typically from FeatureValueToId),
       where N is typically batch size and M is arbitrary.
-    sp_weights: either a SparseTensor of float / double weights, or None to
-      indicate all weights should be taken to be 1. If specified, sp_weights
-      must have exactly the same shape and indices as sp_ids.
+    sp_weights: either a `SparseTensor` of float / double weights, or `None` to
+      indicate all weights should be taken to be 1. If specified, `sp_weights`
+      must have exactly the same shape and indices as `sp_ids`.
     partition_strategy: A string specifying the partitioning strategy, relevant
       if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
       is `"mod"`. See `tf.nn.embedding_lookup` for more details.
@@ -351,39 +351,43 @@ def embedding_lookup_sparse(params,
 
   Returns:
     A dense tensor representing the combined embeddings for the
-    sparse ids. For each row in the dense tensor represented by sp_ids, the op
+    sparse ids. For each row in the dense tensor represented by `sp_ids`, the op
     looks up the embeddings for all ids in that row, multiplies them by the
     corresponding weight, and combines these embeddings as specified.
 
     In other words, if
 
-      shape(combined params) = [p0, p1, ..., pm]
+      `shape(combined params) = [p0, p1, ..., pm]`
 
     and
 
-      shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]
+      `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`
 
     then
 
-      shape(output) = [d0, d1, ..., dn-1, p1, ..., pm].
+      `shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]`.
 
     For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are
 
+      ```python
       [0, 0]: id 1, weight 2.0
       [0, 1]: id 3, weight 0.5
       [1, 0]: id 0, weight 1.0
       [2, 3]: id 1, weight 3.0
+      ```
 
     with `combiner`="mean", then the output will be a 3x20 matrix where
 
+      ```python
       output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
       output[1, :] = (params[0, :] * 1.0) / 1.0
       output[2, :] = (params[1, :] * 3.0) / 3.0
+      ```
 
   Raises:
-    TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
-      None nor SparseTensor.
-    ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
+    TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is 
+      neither `None` nor `SparseTensor`.
+    ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
   """
   if combiner is None:
     logging.warn("The default value of combiner will change from \"mean\" "
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 44473ec69c8ac6cf565f635621eebff7bc403225..13420b7f0ee5f2c186ff99409588b827b281c95f 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -208,7 +208,10 @@ def _AsList(x):
   return x if isinstance(x, (list, tuple)) else [x]
 
 
-def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
+def _DefaultGradYs(grad_ys,
+                   ys,
+                   colocate_gradients_with_ops,
+                   gradient_uid="__unsupported__"):
   """Fill in default values for grad_ys.
 
   Args:
@@ -216,6 +219,9 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
     ys: List of tensors.
     colocate_gradients_with_ops: If True, try colocating gradients with
       the corresponding op.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
 
   Returns:
     A list of gradients to use, without None.
@@ -231,7 +237,7 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
   for i in xrange(len(grad_ys)):
     grad_y = grad_ys[i]
     y = ys[i]
-    with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
+    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
       if grad_y is None:
         if y.dtype.is_complex:
           raise TypeError(
@@ -338,10 +344,10 @@ def _StopOps(from_ops, stop_gradient_ops, pending_count):
 
 
 @contextlib.contextmanager
-def _maybe_colocate_with(op, colocate_gradients_with_ops):
+def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
   """Context to colocate with `op` if `colocate_gradients_with_ops`."""
   if colocate_gradients_with_ops:
-    with ops.colocate_with(op):
+    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
       yield
   else:
     yield
@@ -506,6 +512,9 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
   with ops.name_scope(
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
+    # Get a uid for this call to gradients that can be used to help
+    # cluster ops for compilation.
+    gradient_uid = ops.get_default_graph().unique_name("uid")
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
     xs = [
         x.handle if resource_variable_ops.is_resource_variable(x) else x
@@ -513,7 +522,8 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     ]
     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
         xs, name="x", as_ref=True)
-    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
+    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
+                             gradient_uid)
 
     # The approach we take here is as follows: Create a list of all ops in the
     # subgraph between the ys and xs.  Visit these ops in reverse order of ids
@@ -570,10 +580,11 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     while queue:
       # generate gradient subgraph for op.
       op = queue.popleft()
-      with _maybe_colocate_with(op, colocate_gradients_with_ops):
+      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
         if loop_state:
           loop_state.EnterGradWhileContext(op, before=True)
-        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
+        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
+                                     aggregation_method)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=True)
 
@@ -633,7 +644,10 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
               if gate_gradients and len([x for x in in_grads
                                          if x is not None]) > 1:
                 with ops.device(None):
-                  with ops.colocate_with(None, ignore_existing=True):
+                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+                      None,
+                      gradient_uid,
+                      ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
           _LogOpGradients(op, out_grads, in_grads)
         else:
@@ -789,7 +803,7 @@ def _LogOpGradients(op, out_grads, in_grads):
                ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
 
 
-def _MultiDeviceAddN(tensor_list):
+def _MultiDeviceAddN(tensor_list, gradient_uid):
   """Adds tensors from potentially multiple devices."""
   # Basic function structure comes from control_flow_ops.group().
   # Sort tensors according to their devices.
@@ -808,7 +822,10 @@ def _MultiDeviceAddN(tensor_list):
 
   for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
     tensors = tensors_on_device[dev]
-    with ops.colocate_with(tensors[0].op, ignore_existing=True):
+    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+        tensors[0].op,
+        gradient_uid,
+        ignore_existing=True):
       summands.append(math_ops.add_n(tensors))
 
   return math_ops.add_n(summands)
@@ -834,12 +851,19 @@ class AggregationMethod(object):
   EXPERIMENTAL_ACCUMULATE_N = 2
 
 
-def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
+def _AggregatedGrads(grads,
+                     op,
+                     gradient_uid,
+                     loop_state,
+                     aggregation_method=None):
   """Get the aggregated gradients for op.
 
   Args:
     grads: The map of memoized gradients.
     op: The op to get gradients for.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
     loop_state: An object for maintaining the state of the while loops in the
                 graph. It is of type ControlFlowState. None if the graph
                 contains no while loops.
@@ -916,7 +940,7 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
             out_grads[i] = running_sum
         else:
           used = "add_n"
-          out_grads[i] = _MultiDeviceAddN(out_grad)
+          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
         logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                      tensor_shape, used)
       else:
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index c94f1396b28e2124c6e5123cf711ac86abf174ab..0603d3b6706b960a0fa9d9b33d383dd0c9063780 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
@@ -810,5 +811,29 @@ class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
       gradients.gradients(y, x)
 
 
+class ResourceCondTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    gamma = resource_variable_ops.ResourceVariable(
+        np.random.random((3,)),
+        dtype="float32", name="gamma")
+
+    inputs = array_ops.ones(shape=(3,), dtype="float32")
+
+    def TestFn():
+      output = inputs + gamma
+      return output
+
+    training = array_ops.placeholder_with_default(True, shape=())
+    output = control_flow_ops.cond(
+        training, TestFn, lambda: inputs)
+
+    loss = output
+
+    grads = gradients.gradients(
+        loss, [gamma])
+    self.assertTrue(None not in grads)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e1217e984c8f6723ca171b0fdaa9fa8aed43d75a
--- /dev/null
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -0,0 +1,395 @@
+# array_ops
+BatchToSpace
+BroadcastArgs
+BroadcastGradientArgs
+ConcatOffset
+Concat
+ConcatV2
+ConjugateTranspose
+Const
+DebugGradientIdentity
+DebugGradientRefIdentity
+EditDistance
+ExpandDims
+ListDiff
+MirrorPad
+MirrorPadGrad
+OneHot
+Pack
+Pad
+PadV2
+ParallelConcat
+Placeholder
+RefIdentity
+Reverse
+Snapshot
+SpaceToBatch
+Split
+SplitV
+Squeeze
+Slice
+TileGrad  # Exported through array_grad instead of array_ops.
+ZerosLike  # TODO(josh11b): Use this instead of the Python version.
+Unique
+UniqueV2
+UniqueWithCounts
+UniqueWithCountsV2
+Unpack
+
+# candidate_sampling_ops
+AllCandidateSampler
+ComputeAccidentalHits
+FixedUnigramCandidateSampler
+LearnedUnigramCandidateSampler
+LogUniformCandidateSampler
+ThreadUnsafeUnigramCandidateSampler
+UniformCandidateSampler
+
+# checkpoint_ops
+GenerateVocabRemapping
+LoadAndRemapMatrix
+
+
+# control_flow_ops
+Switch
+Merge
+RefMerge
+Exit
+RefExit
+
+# ctc_ops
+CTCLoss
+CTCGreedyDecoder
+CTCBeamSearchDecoder
+
+# data_flow_ops
+Barrier
+BarrierClose
+BarrierIncompleteSize
+BarrierInsertMany
+BarrierReadySize
+BarrierTakeMany
+DeleteSessionTensor
+FakeQueue
+FIFOQueue
+FIFOQueueV2
+GetSessionHandle
+GetSessionHandleV2
+GetSessionTensor
+HashTable
+HashTableV2
+InitializeTable
+InitializeTableV2
+InitializeTableFromTextFile
+InitializeTableFromTextFileV2
+LookupTableExport
+LookupTableExportV2
+LookupTableFind
+LookupTableFindV2
+LookupTableImport
+LookupTableImportV2
+LookupTableInsert
+LookupTableInsertV2
+LookupTableSize
+LookupTableSizeV2
+MutableDenseHashTable
+MutableDenseHashTableV2
+MutableHashTable
+MutableHashTableV2
+MutableHashTableOfTensors
+MutableHashTableOfTensorsV2
+Mutex
+MutexAcquire
+MutexRelease
+PaddingFIFOQueue
+PaddingFIFOQueueV2
+PriorityQueue
+PriorityQueueV2
+QueueClose
+QueueCloseV2
+QueueDequeue
+QueueDequeueV2
+QueueDequeueMany
+QueueDequeueManyV2
+QueueDequeueUpTo
+QueueDequeueUpToV2
+QueueEnqueue
+QueueEnqueueV2
+QueueEnqueueMany
+QueueEnqueueManyV2
+QueueSize
+QueueSizeV2
+RandomShuffleQueue
+RandomShuffleQueueV2
+Stack
+StackClose
+StackPop
+StackPush
+StackV2
+StackCloseV2
+StackPopV2
+StackPushV2
+TensorArray
+TensorArrayClose
+TensorArrayCloseV2
+TensorArrayConcat
+TensorArrayConcatV2
+TensorArrayGather
+TensorArrayGatherV2
+TensorArrayGrad
+TensorArrayGradV2
+TensorArrayPack
+TensorArrayPackV2
+TensorArrayRead
+TensorArrayReadV2
+TensorArrayScatter
+TensorArrayScatterV2
+TensorArraySize
+TensorArraySizeV2
+TensorArraySplit
+TensorArraySplitV2
+TensorArrayUnpack
+TensorArrayUnpackV2
+TensorArrayV2
+TensorArrayWrite
+TensorArrayWriteV2
+TensorArrayV3
+TensorArrayCloseV3
+TensorArrayConcatV3
+TensorArrayGatherV3
+TensorArrayGradV3
+TensorArrayReadV3
+TensorArrayPackV3
+TensorArrayScatterV3
+TensorArraySizeV3
+TensorArraySplitV3
+TensorArrayUnpackV3
+TensorArrayWriteV3
+
+# functional_ops
+SymbolicGradient
+
+# image_ops
+AdjustContrastv2
+NonMaxSuppression
+NonMaxSuppressionV2
+RandomCrop
+ResizeBilinearGrad
+ResizeBicubicGrad
+ResizeNearestNeighborGrad
+SampleDistortedBoundingBox
+SampleDistortedBoundingBoxV2
+ScaleImageGrad
+
+# io_ops
+FixedLengthRecordReader
+IdentityReader
+ReaderNumRecordsProduced
+ReaderNumWorkUnitsCompleted
+ReaderRead
+ReaderReadUpTo
+ReaderReset
+ReaderRestoreState
+ReaderSerializeState
+ReaderWorkQueueLength
+FixedLengthRecordReaderV2
+IdentityReaderV2
+ReaderNumRecordsProducedV2
+ReaderNumWorkUnitsCompletedV2
+ReaderReadV2
+ReaderReadUpToV2
+ReaderResetV2
+ReaderRestoreStateV2
+ReaderSerializeStateV2
+ReaderWorkQueueLengthV2
+Restore
+RestoreSlice
+Save
+SaveSlices
+ShardedFilename
+ShardedFilespec
+TextLineReader
+TFRecordReader
+WholeFileReader
+TextLineReaderV2
+TFRecordReaderV2
+WholeFileReaderV2
+LMDBReader
+DecodeCSV
+
+# linalg_ops
+BatchCholesky
+BatchCholeskyGrad
+BatchMatrixDeterminant
+BatchMatrixInverse
+BatchMatrixSolve
+BatchMatrixSolveLs
+BatchMatrixTriangularSolve
+BatchSelfAdjointEig
+BatchSelfAdjointEigV2
+BatchSvd
+LogMatrixDeterminant
+MatrixExponential
+MatrixLogarithm
+MatrixSolveLs
+SelfAdjointEig
+SelfAdjointEigV2
+Svd
+
+# logging_ops
+Assert
+AudioSummary
+AudioSummaryV2
+HistogramSummary
+ImageSummary
+MergeSummary
+Print
+ScalarSummary
+TensorSummary
+TensorSummaryV2
+
+# math_ops
+Abs
+AccumulateNV2
+AddN
+AddV2
+All
+Any
+BatchMatMul
+BatchFFT
+BatchFFT2D
+BatchFFT3D
+BatchIFFT
+BatchIFFT2D
+BatchIFFT3D
+Bucketize
+ClipByValue
+Complex
+ComplexAbs
+Conj
+FloorDiv
+FloorMod
+HistogramFixedWidth
+Max
+Mean
+Min
+Mul
+Neg
+Pow
+Prod
+Range
+RealDiv
+Select
+SparseMatMul
+Sub
+Sum
+MatMul
+Sigmoid
+Tanh
+SigmoidGrad
+TanhGrad
+InvGrad
+ReciprocalGrad
+SqrtGrad
+RsqrtGrad
+TruncateDiv
+TruncateMod
+
+# nn_ops
+AvgPoolGrad  # "*Grad" accessible through nn_grad instead of nn_ops.
+AvgPool3DGrad
+BatchNormWithGlobalNormalization
+BatchNormWithGlobalNormalizationGrad
+FusedBatchNorm
+FusedBatchNormV2
+SoftmaxCrossEntropyWithLogits
+SparseSoftmaxCrossEntropyWithLogits
+LRNGrad
+MaxPoolGrad
+MaxPoolGradWithArgmax
+MaxPoolGradGrad
+MaxPoolGradGradWithArgmax
+MaxPool3DGrad
+MaxPool3DGradGrad
+ReluGrad
+Relu6Grad
+EluGrad
+SeluGrad
+SoftplusGrad
+SoftsignGrad
+TopK
+TopKV2
+BiasAdd
+BiasAddV1
+Relu6
+AvgPool
+MaxPool
+MaxPoolV2
+Softmax
+LogSoftmax
+FractionalAvgPoolGrad
+FractionalMaxPoolGrad
+InTopK
+InTopKV2
+
+# parsing_ops
+ParseExample
+ParseSingleSequenceExample
+
+# random_ops
+RandomGamma
+RandomPoisson
+RandomUniform
+RandomUniformInt
+RandomShuffle
+RandomStandardNormal
+ParameterizedTruncatedNormal
+TruncatedNormal
+
+# script_ops
+PyFunc
+PyFuncStateless
+EagerPyFunc
+
+# sdca_ops
+
+# state_ops
+Variable
+VariableV2
+TemporaryVariable
+DestroyTemporaryVariable
+
+# sparse_ops
+AddSparseToTensorsMap
+AddManySparseToTensorsMap
+TakeManySparseFromTensorsMap
+DeserializeManySparse
+DeserializeSparse
+SerializeManySparse
+SerializeSparse
+SparseAdd
+SparseAddGrad
+SparseConcat
+SparseCross
+SparseFillEmptyRows
+SparseFillEmptyRowsGrad
+SparseSplit
+SparseSelectLastK
+SparseReorder
+SparseReshape
+SparseToDense
+SparseTensorDenseAdd
+SparseTensorDenseMatMul
+
+# string_ops
+StringSplit
+
+# user_ops
+Fact
+
+# training_ops
+# (None)
+
+# word2vec deprecated ops
+NegTrain
+Skipgram
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 3369fe3c9b37ca05311c5548dbfa3228ba04ee80..601010bce9efaf1bcc864ce28a4c0bb8f8622823 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -269,17 +269,7 @@ def random_flip_up_down(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
-    image = ops.convert_to_tensor(image, name='image')
-    image = _Assert3DImage(image)
-    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-    mirror_cond = math_ops.less(uniform_random, .5)
-    result = control_flow_ops.cond(
-        mirror_cond,
-        lambda: array_ops.reverse(image, [0]),
-        lambda: image,
-        name=scope)
-    return fix_image_flip_shape(image, result)
+  return _random_flip(image, 0, seed, 'random_flip_up_down')
 
 
 @tf_export('image.random_flip_left_right')
@@ -301,14 +291,34 @@ def random_flip_left_right(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
+  return _random_flip(image, 1, seed, 'random_flip_left_right')
+
+
+def _random_flip(image, flip_index, seed, scope_name):
+  """Randomly (50% chance) flip an image along axis `flip_index`.
+    Args:
+      image: A 3-D tensor of shape `[height, width, channels].`
+      flip_index: The dimension along which to flip the image.
+                  Vertical: 0, Horizontal: 1
+      seed: A Python integer. Used to create a random seed. See
+        @{tf.set_random_seed}
+        for behavior.
+      scope_name: Name of the scope in which the ops are added.
+
+    Returns:
+      A 3-D tensor of the same type and shape as `image`.
+
+    Raises:
+      ValueError: if the shape of `image` not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
     image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
     result = control_flow_ops.cond(
         mirror_cond,
-        lambda: array_ops.reverse(image, [1]),
+        lambda: array_ops.reverse(image, [flip_index]),
         lambda: image,
         name=scope)
     return fix_image_flip_shape(image, result)
@@ -332,16 +342,7 @@ def flip_left_right(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_left_right', [image]):
-    image = ops.convert_to_tensor(image, name='image')
-    image = _AssertAtLeast3DImage(image)
-    shape = image.get_shape()
-    if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
-    elif shape.ndims == 4:
-      return array_ops.reverse(image, [2])
-    else:
-      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+  return _flip(image, 1, 'flip_left_right')
 
 
 @tf_export('image.flip_up_down')
@@ -362,14 +363,35 @@ def flip_up_down(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_up_down', [image]):
+  return _flip(image, 0, 'flip_up_down')
+
+
+def _flip(image, flip_index, scope_name):
+  """Flip an image either horizontally or vertically.
+
+  Outputs the contents of `image` flipped along the dimension `flip_index`.
+
+  See also `reverse()`.
+
+  Args:
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
+    flip_index: 0 For vertical, 1 for horizontal.
+
+  Returns:
+    A tensor of the same type and shape as `image`.
+
+  Raises:
+    ValueError: if the shape of `image` not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
     if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
+      return fix_image_flip_shape(image, array_ops.reverse(image, [flip_index]))
     elif shape.ndims == 4:
-      return array_ops.reverse(image, [1])
+      return array_ops.reverse(image, [flip_index+1])
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 366a72c972f4f5d0be2fc4565c89f7bfb4ea11ea..09cf6dd238b753fa1107b1a298f8160abb89b090 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -500,10 +500,10 @@ class Orthogonal(Initializer):
 
   Args:
     gain: multiplicative factor to apply to the orthogonal matrix
-    dtype: The type of the output.
     seed: A Python integer. Used to create random seeds. See
       @{tf.set_random_seed}
       for behavior.
+    dtype: The data type.
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -553,10 +553,10 @@ class ConvolutionDeltaOrthogonal(Initializer):
     gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
       The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
       applying this convolution.
-    dtype: The type of the output.
     seed: A Python integer. Used to create random seeds. See
       @{tf.set_random_seed}
       for behavior.
+    dtype: The data type.
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -582,7 +582,6 @@ class ConvolutionDeltaOrthogonal(Initializer):
     q, r = gen_linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
-    # ph = d / math_ops.abs(d)
     q *= math_ops.sign(d)
     q = q[:shape[-2], :]
     q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
@@ -602,6 +601,186 @@ class ConvolutionDeltaOrthogonal(Initializer):
     return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
 
 
+class ConvolutionOrthogonal2D(Initializer):
+  """Initializer that generates a 2D orthogonal kernel for ConvNets.
+
+  The shape of the tensor must have length 2. The number of input
+  filters must not exceed the number of output filters.
+  The orthogonality(==isometry) is exact when the inputs are circular padded.
+  There are finite-width effects with non-circular padding (e.g. zero padding).
+
+  Args:
+    gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
+      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
+      applying this convolution.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed}
+      for behavior.
+    dtype: The data type.
+  """
+
+  def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
+    self.gain = gain
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
+    self.seed = seed
+
+  def __call__(self, shape, dtype=None, partition_info=None):
+    if dtype is None:
+      dtype = self.dtype
+    # Check the shape
+    if len(shape) != 4:
+      raise ValueError("The tensor to initialize must be four-dimensional")
+
+    if shape[-2] > shape[-1]:
+      raise ValueError("In_filters cannot be greater than out_filters.")
+
+    if shape[0] != shape[1]:
+      raise ValueError("Kernel sizes must be equal.")
+
+    kernel = self._orthogonal_kernel(shape[0], shape[2], shape[3])
+    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    return kernel
+
+  def get_config(self):
+    return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
+
+  # Helper functions.
+  def _orthogonal_matrix(self, n):
+    """Construct an n x n orthogonal matrix.
+
+    Args:
+      n: dimension.
+    Returns:
+      a n x n orthogonal matrix.
+    """
+    a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed)
+    if self.seed:
+      self.seed += 1
+    q, r = linalg_ops.qr(a)
+    d = array_ops.diag_part(r)
+    # make q uniform
+    q *= math_ops.sign(d)
+    return q
+
+  def _symmetric_projection(self, n):
+    """Compute a n x n symmetric projection matrix.
+
+    Args:
+      n: dimension.
+    Returns:
+      a n x n symmetric projection matrix, i.e. a matrix P s.t. P=P*P, P=P^T.
+    """
+    q = self._orthogonal_matrix(n)
+    # randomly zeroing out some columns
+    mask = math_ops.cast(random_ops.random_normal([n], seed=self.seed) > 0,
+                         self.dtype)
+    if self.seed:
+      self.seed += 1
+    c = math_ops.multiply(q, mask)
+    return math_ops.matmul(c, array_ops.matrix_transpose(c))
+
+  def _dict_to_tensor(self, x, k1, k2):
+    """Convert a dictionary to a tensor.
+
+    Args:
+      x: a k1 * k2 dictionary.
+      k1: first dimension of x.
+      k2: second dimension of x.
+    Returns:
+      a k1 * k2 tensor.
+    """
+
+    return array_ops.stack([array_ops.stack([x[i, j] for j in range(k2)])
+                            for i in range(k1)])
+
+  def _block_orth(self, p1, p2):
+    """Construct a 2 x 2 kernel. Used to construct orthgonal kernel.
+
+    Args:
+      p1: a symmetric projection matrix
+      p2: a symmetric projection matrix
+    Returns:
+      a 2 x 2 kernel [[p1p2,         p1(1-p2)],
+                      [(1-p1)p2, (1-p1)(1-p2)]].
+    Raises:
+      ValueError: if the dimensions of p1 and p2 are different.
+    """
+    if p1.shape.as_list() != p2.shape.as_list():
+      raise ValueError("The dimension of the matrices must be the same.")
+    n = p1.shape.as_list()[0]
+    kernel2x2 = {}
+    eye = linalg_ops.eye(n, dtype=self.dtype)
+    kernel2x2[0, 0] = math_ops.matmul(p1, p2)
+    kernel2x2[0, 1] = math_ops.matmul(p1, (eye - p2))
+    kernel2x2[1, 0] = math_ops.matmul((eye - p1), p2)
+    kernel2x2[1, 1] = math_ops.matmul((eye - p1), (eye - p2))
+
+    return kernel2x2
+
+  def _matrix_conv(self, m1, m2):
+    """Matrix convolution.
+
+    Args:
+      m1: is a k x k dictionary, each element is a n x n matrix.
+      m2: is a l x l dictionary, each element is a n x n matrix.
+
+    Returns:
+      (k + l - 1) * (k + l - 1) dictionary each element is a n x n matrix.
+    Raises:
+      ValueError: if the entries of m1 and m2 are of different dimensions.
+    """
+
+    n = (m1[0, 0]).shape.as_list()[0]
+    if n != (m2[0, 0]).shape.as_list()[0]:
+      raise ValueError("The entries in matrices m1 and m2 "
+                       "must have the same dimensions!")
+    k = int(np.sqrt(len(m1)))
+    l = int(np.sqrt(len(m2)))
+    result = {}
+    size = k + l - 1
+    # Compute matrix convolution between m1 and m2.
+    for i in range(size):
+      for j in range(size):
+        result[i, j] = array_ops.zeros([n, n], self.dtype)
+        for index1 in range(min(k, i + 1)):
+          for index2 in range(min(k, j + 1)):
+            if (i - index1) < l and (j - index2) < l:
+              result[i, j] += math_ops.matmul(m1[index1, index2],
+                                              m2[i - index1, j - index2])
+    return result
+
+  def _orthogonal_kernel(self, ksize, cin, cout):
+    """Construct orthogonal kernel for convolution.
+
+    Args:
+      ksize: kernel size
+      cin: number of input channels
+      cout: number of output channels
+    Returns:
+      an [ksize, ksize, cin, cout] orthogonal kernel.
+    Raises:
+      ValueError: if cin > cout.
+    """
+    if cin > cout:
+      raise ValueError("The number of input channels cannot exceed "
+                       "the number of output channels.")
+    orth = self._orthogonal_matrix(cout)[0:cin, :]
+    if ksize == 1:
+      return array_ops.expand_dims(array_ops.expand_dims(orth, 0), 0)
+
+    p = self._block_orth(self._symmetric_projection(cout),
+                         self._symmetric_projection(cout))
+    for _ in range(ksize - 2):
+      temp = self._block_orth(self._symmetric_projection(cout),
+                              self._symmetric_projection(cout))
+      p = self._matrix_conv(p, temp)
+    for i in range(ksize):
+      for j in range(ksize):
+        p[i, j] = math_ops.matmul(orth, p[i, j])
+
+    return self._dict_to_tensor(p, ksize, ksize)
+
+
 @tf_export("keras.initializers.Identity", "initializers.identity")
 class Identity(Initializer):
   """Initializer that generates the identity matrix.
@@ -647,6 +826,7 @@ variance_scaling_initializer = VarianceScaling
 orthogonal_initializer = Orthogonal
 identity_initializer = Identity
 convolutional_delta_orthogonal = ConvolutionDeltaOrthogonal
+convolutional_orthogonal_2d = ConvolutionOrthogonal2D
 # pylint: enable=invalid-name
 
 
diff --git a/tensorflow/python/ops/inplace_ops.py b/tensorflow/python/ops/inplace_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5b000086b05219d23cd88935948f88f2cc718bf
--- /dev/null
+++ b/tensorflow/python/ops/inplace_ops.py
@@ -0,0 +1,227 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inplace operations.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _inplace_helper(x, i, v, op):
+  """Applies an inplace op on (x, i, v).
+
+  op is one of gen_array_ops.alias_inplace_update,
+  gen_array_ops.alias_inplace_add, or gen_array_ops.alias_inplace_sub.
+
+  If i is None, x and v must be the same shape. Computes
+    x op v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] op v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] op v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+    op: alias_inplace_update, alias_inplace_add, or alias_inplace_sub.
+
+  Returns:
+    Returns x.
+
+  """
+  x = ops.convert_to_tensor(x)
+  v = ops.convert_to_tensor(v, x.dtype)
+  if i is None:
+    # Full tensor.
+    return array_ops.reshape(
+        op(array_ops.reshape(x, [1, -1]), [0], array_ops.reshape(v, [1, -1])),
+        array_ops.shape(x))
+  i = math_ops.to_int32(i)
+  if i.get_shape().ndims == 0:
+    # Single 0-dim update.
+    return op(x, array_ops.reshape(i, [1]), array_ops.expand_dims(v, 0))
+  return op(x, i, v)
+
+
+def alias_inplace_update(x, i, v):
+  """Applies an inplace update on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x = v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] = v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] = v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_update)
+
+
+def alias_inplace_add(x, i, v):
+  """Applies an inplace add on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x += v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] += v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] += v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_add)
+
+
+def alias_inplace_sub(x, i, v):
+  """Applies an inplace sub on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x -= v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] -= v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] -= v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_sub)
+
+
+def empty_like(x, init=None):
+  """Returns a non-initialized tensor with the same shape and dtype as x.
+
+  Args:
+    x: A Tensor.
+    init: Initialize the returned tensor with the default value of
+      x.dtype(), if True. Otherwise, do not initialize. Defaults to
+      None.
+
+  Returns:
+    A tensor y, whose dtype and shape are the same as those of x.
+    y is guaranteed not to be an alias of x. Upon return, y may contain
+    arbitrary data.
+
+  """
+  x = ops.convert_to_tensor(x)
+  return gen_array_ops.empty(array_ops.shape(x), x.dtype, init=init)
+
+
+def inplace_update(x, i, v):
+  """Applies an inplace update on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y = v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] = v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] = v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_update(gen_array_ops.deep_copy(x), i, v)
+
+
+def inplace_add(x, i, v):
+  """Applies an inplace add on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y += v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] += v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] += v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_add(gen_array_ops.deep_copy(x), i, v)
+
+
+def inplace_sub(x, i, v):
+  """Applies an inplace sub on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y -= v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] -= v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] -= v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_sub(gen_array_ops.deep_copy(x), i, v)
+
+empty = gen_array_ops.empty
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 193c787baa2ac68feec7e5d8bb03b251fc78d781..8cfe964b1c0a572f43a14c66885e74ea105b0916 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -699,9 +699,10 @@ class LinearOperator(object):
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
     if self._can_use_cholesky():
-      return linalg_ops.cholesky_solve(
+      return linear_operator_util.cholesky_solve_with_broadcast(
           linalg_ops.cholesky(self.to_dense()), rhs)
-    return linalg_ops.matrix_solve(self.to_dense(), rhs, adjoint=adjoint)
+    return linear_operator_util.matrix_solve_with_broadcast(
+        self.to_dense(), rhs, adjoint=adjoint)
 
   def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"):
     """Solve (exact or approx) `R` (batch) systems of equations: `A X = rhs`.
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index 5ba3b090ae9decaba239b31226db84c2d7b254bd..746da8df1ce957e86bc2e730b5709a699adbf612 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorFullMatrix"]
@@ -176,7 +176,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
     return array_ops.shape(self._matrix)
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
-    return math_ops.matmul(
+    return linear_operator_util.matmul_with_broadcast(
         self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _to_dense(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index be911029095920d424ac90b406e7b85b73884b3b..08e5896e1034fb1782beacfb18fef16da083bded 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_diag
 from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -365,14 +366,17 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     leading_term = l.matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
     if adjoint:
-      uh_x = math_ops.matmul(u, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      uh_x = linear_operator_util.matmul_with_broadcast(
+          u, x, adjoint_a=True, adjoint_b=adjoint_arg)
       d_uh_x = d.matmul(uh_x, adjoint=adjoint)
-      v_d_uh_x = math_ops.matmul(v, d_uh_x)
+      v_d_uh_x = linear_operator_util.matmul_with_broadcast(
+          v, d_uh_x)
       return leading_term + v_d_uh_x
     else:
-      vh_x = math_ops.matmul(v, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      vh_x = linear_operator_util.matmul_with_broadcast(
+          v, x, adjoint_a=True, adjoint_b=adjoint_arg)
       d_vh_x = d.matmul(vh_x, adjoint=adjoint)
-      u_d_vh_x = math_ops.matmul(u, d_vh_x)
+      u_d_vh_x = linear_operator_util.matmul_with_broadcast(u, d_vh_x)
       return leading_term + u_d_vh_x
 
   def _determinant(self):
@@ -431,16 +435,18 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     # L^{-1} rhs
     linv_rhs = l.solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
     # V^H L^{-1} rhs
-    vh_linv_rhs = math_ops.matmul(v, linv_rhs, adjoint_a=True)
+    vh_linv_rhs = linear_operator_util.matmul_with_broadcast(
+        v, linv_rhs, adjoint_a=True)
     # C^{-1} V^H L^{-1} rhs
     if self._use_cholesky:
-      capinv_vh_linv_rhs = linalg_ops.cholesky_solve(
+      capinv_vh_linv_rhs = linear_operator_util.cholesky_solve_with_broadcast(
           self._chol_capacitance, vh_linv_rhs)
     else:
-      capinv_vh_linv_rhs = linalg_ops.matrix_solve(
+      capinv_vh_linv_rhs = linear_operator_util.matrix_solve_with_broadcast(
           self._capacitance, vh_linv_rhs, adjoint=adjoint)
     # U C^{-1} V^H M^{-1} rhs
-    u_capinv_vh_linv_rhs = math_ops.matmul(u, capinv_vh_linv_rhs)
+    u_capinv_vh_linv_rhs = linear_operator_util.matmul_with_broadcast(
+        u, capinv_vh_linv_rhs)
     # L^{-1} U C^{-1} V^H L^{-1} rhs
     linv_u_capinv_vh_linv_rhs = l.solve(u_capinv_vh_linv_rhs, adjoint=adjoint)
 
@@ -454,7 +460,8 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     # L^{-1} U
     linv_u = self.base_operator.solve(self.u)
     # V^H L^{-1} U
-    vh_linv_u = math_ops.matmul(self.v, linv_u, adjoint_a=True)
+    vh_linv_u = linear_operator_util.matmul_with_broadcast(
+        self.v, linv_u, adjoint_a=True)
 
     # D^{-1} + V^H L^{-1} V
     capacitance = self._diag_inv_operator.add_to_tensor(vh_linv_u)
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index c4d386ccb4efc7dede8310243e517fe2f6b45bd9..fb1eb2fedba5b47ce38f9635527b91e18d894a8f 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
@@ -194,7 +193,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         message="Singular operator:  Diagonal contained zero values.")
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
-    return math_ops.matmul(
+    return linear_operator_util.matmul_with_broadcast(
         self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _determinant(self):
@@ -206,7 +205,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
-    return linalg_ops.matrix_triangular_solve(
+    return linear_operator_util.matrix_triangular_solve_with_broadcast(
         self._tril, rhs, lower=True, adjoint=adjoint)
 
   def _to_dense(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index ce1a112ad584a14298be6e471578858ef31573d5..9c8abb97406516dddbc450db266bd61f52f35bd7 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
 
 
@@ -126,13 +127,16 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("Not implemented yet.")
 
   @abc.abstractmethod
-  def _make_rhs(self, operator, adjoint):
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     """Make a rhs appropriate for calling operator.solve(rhs).
 
     Args:
       operator:  A `LinearOperator`
       adjoint:  Python `bool`.  If `True`, we are making a 'rhs' value for the
         adjoint operator.
+      with_batch: Python `bool`. If `True`, create `rhs` with the same batch
+        shape as operator, and otherwise create a matrix without any batch
+        shape.
 
     Returns:
       A `Tensor`
@@ -140,13 +144,15 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("_make_rhs is not defined.")
 
   @abc.abstractmethod
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     """Make an 'x' appropriate for calling operator.matmul(x).
 
     Args:
       operator:  A `LinearOperator`
       adjoint:  Python `bool`.  If `True`, we are making an 'x' value for the
         adjoint operator.
+      with_batch: Python `bool`. If `True`, create `x` with the same batch shape
+        as operator, and otherwise create a matrix without any batch shape.
 
     Returns:
       A `Tensor`
@@ -224,8 +230,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 [op_log_abs_det, mat_log_abs_det], feed_dict=feed_dict)
             self.assertAC(op_log_abs_det_v, mat_log_abs_det_v)
 
-  def test_matmul(self):
-    self._skip_if_tests_to_skip_contains("matmul")
+  def _test_matmul(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
@@ -235,7 +240,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
                 operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
                     build_info, dtype, use_placeholder=use_placeholder)
-                x = self._make_x(operator, adjoint=adjoint)
+                x = self._make_x(
+                    operator, adjoint=adjoint, with_batch=with_batch)
                 # If adjoint_arg, compute A X^H^H = A X.
                 if adjoint_arg:
                   op_matmul = operator.matmul(
@@ -244,7 +250,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                       adjoint_arg=adjoint_arg)
                 else:
                   op_matmul = operator.matmul(x, adjoint=adjoint)
-                mat_matmul = math_ops.matmul(mat, x, adjoint_a=adjoint)
+                mat_matmul = linear_operator_util.matmul_with_broadcast(
+                    mat, x, adjoint_a=adjoint)
                 if not use_placeholder:
                   self.assertAllEqual(op_matmul.get_shape(),
                                       mat_matmul.get_shape())
@@ -252,8 +259,15 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                     [op_matmul, mat_matmul], feed_dict=feed_dict)
                 self.assertAC(op_matmul_v, mat_matmul_v)
 
-  def test_solve(self):
-    self._skip_if_tests_to_skip_contains("solve")
+  def test_matmul(self):
+    self._skip_if_tests_to_skip_contains("matmul")
+    self._test_matmul(with_batch=True)
+
+  def test_matmul_with_broadcast(self):
+    self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
+    self._test_matmul(with_batch=False)
+
+  def _test_solve(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
         for dtype in self._dtypes_to_test:
@@ -263,7 +277,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
                 operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
                     build_info, dtype, use_placeholder=use_placeholder)
-                rhs = self._make_rhs(operator, adjoint=adjoint)
+                rhs = self._make_rhs(
+                    operator, adjoint=adjoint, with_batch=with_batch)
                 # If adjoint_arg, solve A X = (rhs^H)^H = rhs.
                 if adjoint_arg:
                   op_solve = operator.solve(
@@ -273,7 +288,8 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 else:
                   op_solve = operator.solve(
                       rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
-                mat_solve = linalg_ops.matrix_solve(mat, rhs, adjoint=adjoint)
+                mat_solve = linear_operator_util.matrix_solve_with_broadcast(
+                    mat, rhs, adjoint=adjoint)
                 if not use_placeholder:
                   self.assertAllEqual(op_solve.get_shape(),
                                       mat_solve.get_shape())
@@ -281,6 +297,14 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                     [op_solve, mat_solve], feed_dict=feed_dict)
                 self.assertAC(op_solve_v, mat_solve_v)
 
+  def test_solve(self):
+    self._skip_if_tests_to_skip_contains("solve")
+    self._test_solve(with_batch=True)
+
+  def test_solve_with_broadcast(self):
+    self._skip_if_tests_to_skip_contains("solve_with_broadcast")
+    self._test_solve(with_batch=False)
+
   def test_trace(self):
     self._skip_if_tests_to_skip_contains("trace")
     for use_placeholder in self._use_placeholder_options:
@@ -358,13 +382,13 @@ class SquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         build_info((3, 4, 4)),
         build_info((2, 1, 4, 4))]
 
-  def _make_rhs(self, operator, adjoint):
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     # This operator is square, so rhs and x will have same shape.
     # adjoint value makes no difference because the operator shape doesn't
     # change since it is square, but be pedantic.
-    return self._make_x(operator, adjoint=not adjoint)
+    return self._make_x(operator, adjoint=not adjoint, with_batch=with_batch)
 
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     # Value of adjoint makes no difference because the operator is square.
     # Return the number of systems to solve, R, equal to 1 or 2.
     r = self._get_num_systems(operator)
@@ -373,11 +397,17 @@ class SquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
     if operator.shape.is_fully_defined():
       batch_shape = operator.batch_shape.as_list()
       n = operator.domain_dimension.value
-      x_shape = batch_shape + [n, r]
+      if with_batch:
+        x_shape = batch_shape + [n, r]
+      else:
+        x_shape = [n, r]
     else:
       batch_shape = operator.batch_shape_tensor()
       n = operator.domain_dimension_tensor()
-      x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      if with_batch:
+        x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      else:
+        x_shape = [n, r]
 
     return random_normal(x_shape, dtype=operator.dtype)
 
@@ -404,7 +434,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["solve", "det", "log_abs_det"]
+    return ["solve", "solve_with_broadcast", "det", "log_abs_det"]
 
   @property
   def _operator_build_infos(self):
@@ -417,12 +447,12 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         build_info((3, 3, 4)),
         build_info((2, 1, 2, 4))]
 
-  def _make_rhs(self, operator, adjoint):
+  def _make_rhs(self, operator, adjoint, with_batch=True):
     # TODO(langmore) Add once we're testing solve_ls.
     raise NotImplementedError(
         "_make_rhs not implemented because we don't test solve")
 
-  def _make_x(self, operator, adjoint):
+  def _make_x(self, operator, adjoint, with_batch=True):
     # Return the number of systems for the argument 'x' for .matmul(x)
     r = self._get_num_systems(operator)
     # If operator.shape = [B1,...,Bb, M, N] this returns a random matrix of
@@ -433,14 +463,20 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         n = operator.range_dimension.value
       else:
         n = operator.domain_dimension.value
-      x_shape = batch_shape + [n, r]
+      if with_batch:
+        x_shape = batch_shape + [n, r]
+      else:
+        x_shape = [n, r]
     else:
       batch_shape = operator.batch_shape_tensor()
       if adjoint:
         n = operator.range_dimension_tensor()
       else:
         n = operator.domain_dimension_tensor()
-      x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      if with_batch:
+        x_shape = array_ops.concat((batch_shape, [n, r]), 0)
+      else:
+        x_shape = [n, r]
 
     return random_normal(x_shape, dtype=operator.dtype)
 
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index bba59ebcef9c7caf1a53d724767999ae7ac079e5..bdf0774bbf834ec10f68423e89e3b8b9b96ad9a1 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -54,8 +54,8 @@ def _TensorListStackGrad(unused_op, dtensor):
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape[0] is not None:
-    num_elements = op.inputs[0].shape[0]
+  if op.inputs[0].shape[0].value is not None:
+    num_elements = op.inputs[0].shape[0].value
   else:
     num_elements = None
   if dlist is None:
@@ -63,9 +63,10 @@ def _TensorListFromTensorGrad(op, dlist):
         element_dtype=op.inputs[0].dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
-  return gen_list_ops.tensor_list_stack(
-      dlist, element_dtype=op.inputs[0].dtype,
-      num_elements=num_elements)
+  tensor_grad = gen_list_ops.tensor_list_stack(
+      dlist, element_dtype=op.inputs[0].dtype, num_elements=num_elements)
+  shape_grad = None
+  return tensor_grad, shape_grad
 
 
 @ops.RegisterGradient("TensorListGetItem")
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 34ca1adc3e13dc67560fb21d70c16cd42dc40552..9fc545c9678e7eb33a7ad35e2a84f890885e09af 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -306,11 +307,8 @@ def cosine_distance(
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `axis`, `labels`, `predictions` or `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   if labels is None:
     raise ValueError("labels must not be None.")
@@ -696,7 +694,7 @@ def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
-  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.
+  """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits_v2.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
   then the loss is simply scaled by the given value. If `weights` is a
@@ -707,11 +705,16 @@ def softmax_cross_entropy(
       new_onehot_labels = onehot_labels * (1 - label_smoothing)
                           + label_smoothing / num_classes
 
+  Note that `onehot_labels` and `logits` must have the same shape,
+  e.g. `[batch_size, num_classes]`. The shape of `weights` must be
+  broadcastable to loss, whose shape is decided by the shape of `logits`.
+  In case the shape of `logits` is `[batch_size, num_classes]`, loss is
+  a `Tensor` of shape `[batch_size]`.
+
   Args:
-    onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
-    logits: `[batch_size, num_classes]` logits outputs of the network .
-    weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
-      broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
+    onehot_labels: One-hot-encoded labels.
+    logits: Logits outputs of the network.
+    weights: Optional `Tensor` that is broadcastable to loss.
     label_smoothing: If greater than 0 then smooth the labels.
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 01d670ea2d93d54f558a9be6aabf7b6b237f65ef..2a732b905bfeaa7538f806441936dfa384f7490d 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -211,11 +211,9 @@ def argmax(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
@@ -231,11 +229,9 @@ def argmin(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)
 
@@ -1632,7 +1628,7 @@ def reduce_min(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -1681,7 +1677,7 @@ def reduce_max(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index ee1a00623a734e18d4aebe6c84f77ba53ee1050c..1d0d9a52a125b32c8fba65f8b2c9907331296836 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -98,6 +98,7 @@ See the @{$python/nn} guide.
 @@fixed_unigram_candidate_sampler
 @@compute_accidental_hits
 @@quantized_conv2d
+@@quantized_relu
 @@quantized_relu_x
 @@quantized_max_pool
 @@quantized_avg_pool
@@ -126,8 +127,6 @@ from tensorflow.python.ops.nn_impl import *
 from tensorflow.python.ops.nn_ops import *
 from tensorflow.python.ops.candidate_sampling_ops import *
 from tensorflow.python.ops.embedding_ops import *
-from tensorflow.python.ops.rnn import *
-from tensorflow.python.ops import rnn_cell
 # pylint: enable=wildcard-import,unused-import
 
 
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 47cc4da7f2abd1f5b00e193a76c8391be94ca27d..d0d5ed07ced362708ffb08e02f37562d5d7616fd 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -987,7 +987,7 @@ def _compute_sampled_logits(weights,
         class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1012,7 +1012,7 @@ def _compute_sampled_logits(weights,
     out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
-        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
+        `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax).
     out_labels: A Tensor object with the same shape as `out_logits`.
   """
 
@@ -1285,7 +1285,7 @@ def sampled_softmax_loss(weights,
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
-    loss = tf.nn.softmax_cross_entropy_with_logits(
+    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
         labels=labels_one_hot,
         logits=logits)
   ```
@@ -1303,7 +1303,7 @@ def sampled_softmax_loss(weights,
     biases: A `Tensor` of shape `[num_classes]`.  The class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
-        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
@@ -1340,7 +1340,8 @@ def sampled_softmax_loss(weights,
       partition_strategy=partition_strategy,
       name=name,
       seed=seed)
-  sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
+  labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
+  sampled_losses = nn_ops.softmax_cross_entropy_with_logits_v2(
       labels=labels, logits=logits)
   # sampled_losses is a [batch_size] tensor.
   return sampled_losses
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 07ca32953f796466964d4555e45052fcf3c53ce0..ea83ba77484326455747be09f0fc73018d8bc58e 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1803,8 +1803,11 @@ def softmax_cross_entropy_with_logits_v2(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  `logits` and `labels` must have the same shape, e.g.
-  `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `dim` argument specifying the class dimension.
+
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
   or `float64`).
 
   Backpropagation will happen into both `logits` and `labels`.  To disallow
@@ -1816,14 +1819,17 @@ def softmax_cross_entropy_with_logits_v2(
 
   Args:
     _sentinel: Used to prevent positional parameters. Internal, do not use.
-    labels: Each row `labels[i]` must be a valid probability distribution.
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
     logits: Unscaled log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
 
   Returns:
-    A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-    softmax cross entropy loss.
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -1926,9 +1932,9 @@ def softmax_cross_entropy_with_logits(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  `logits` and `labels` must have the same shape, e.g.
-  `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
-  or `float64`).
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `dim` argument specifying the class dimension.
 
   Backpropagation will happen only into `logits`.  To calculate a cross entropy
   loss that allows backpropagation into both `logits` and `labels`, see
@@ -1939,14 +1945,17 @@ def softmax_cross_entropy_with_logits(
 
   Args:
     _sentinel: Used to prevent positional parameters. Internal, do not use.
-    labels: Each row `labels[i]` must be a valid probability distribution.
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
     logits: Unscaled log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
 
   Returns:
-    A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-    softmax cross entropy loss.
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -1977,14 +1986,17 @@ def sparse_softmax_cross_entropy_with_logits(
   must provide a single specific index for the true class for each row of
   `logits` (each minibatch entry).  For soft softmax classification with
   a probability distribution for each entry, see
-  `softmax_cross_entropy_with_logits`.
+  `softmax_cross_entropy_with_logits_v2`.
 
   **WARNING:** This op expects unscaled logits, since it performs a `softmax`
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  A common use case is to have logits of shape `[batch_size, num_classes]` and
-  labels of shape `[batch_size]`. But higher dimensions are supported.
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, in which
+  case the `dim`-th dimension is assumed to be of size `num_classes`.
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
+  or `float64`).
 
   **Note that to avoid confusion, it is required to pass only named arguments to
   this function.**
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 075b38d743d13329e646c0b268e938b5c5704e47..d8d9af545f17fe3e0133b51b1eab82f7732dc299 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -1176,8 +1176,13 @@ def _parse_single_sequence_example_raw(serialized,
 
 # Swap `name` and `na_value` for backward compatibility.
 @tf_export("decode_csv")
-def decode_csv(records, record_defaults, field_delim=",",
-               use_quote_delim=True, name=None, na_value=""):
+def decode_csv(records,
+               record_defaults,
+               field_delim=",",
+               use_quote_delim=True,
+               name=None,
+               na_value="",
+               select_cols=None):
   """Convert CSV records to tensors. Each column maps to one tensor.
 
   RFC 4180 format is expected for the CSV records.
@@ -1200,19 +1205,32 @@ def decode_csv(records, record_defaults, field_delim=",",
       Bullet 5).
     name: A name for the operation (optional).
     na_value: Additional string to recognize as NA/NaN.
+    select_cols: Optional sorted list of column indices to select. If specified,
+      only this subset of columns will be parsed and returned.
 
   Returns:
     A list of `Tensor` objects. Has the same type as `record_defaults`.
     Each tensor will have the same shape as records.
+
+  Raises:
+    ValueError: If any of the arguments is malformed.
   """
-  # TODO(martinwicke), remove the wrapper when new Python API generator is done.
+  if select_cols is not None and any(select_cols[i] >= select_cols[i + 1]
+                                     for i in range(len(select_cols) - 1)):
+    raise ValueError("select_cols is not strictly increasing.")
+  if select_cols is not None and select_cols[0] < 0:
+    raise ValueError("select_cols contains negative values.")
+  if select_cols is not None and len(select_cols) != len(record_defaults):
+    raise ValueError("Length of select_cols and record_defaults do not match.")
   return gen_parsing_ops.decode_csv(
       records=records,
       record_defaults=record_defaults,
       field_delim=field_delim,
       use_quote_delim=use_quote_delim,
       na_value=na_value,
-      name=name)
+      name=name,
+      select_cols=select_cols,
+  )
 
 
 # TODO(b/70890287): Combine the implementation of this op and
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 508ba9bfeeb4dcb627288357c1c7b6ab4ef14c5c..49dd7f9948db1bf9a9f7f3c1d02c01fea3de3ceb 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -522,11 +522,19 @@ class ResourceVariable(variables.Variable):
     else:
       self._initial_value = None
     if variable_def.snapshot_name:
-      self._cached_value = g.as_graph_element(
+      snapshot = g.as_graph_element(
           ops.prepend_name_scope(
               variable_def.snapshot_name, import_scope=import_scope))
+      self._cached_value = snapshot
+      while snapshot.op.type != "ReadVariableOp":
+        snapshot = snapshot.op.inputs[0]
+      self._graph_element = snapshot
     else:
       self._cached_value = None
+      # Legacy case for protos without the snapshot name; assume it's the
+      # following.
+      self._graph_element = g.get_tensor_by_name(
+          self._handle.op.name + "/Read/ReadVariableOp:0")
     if variable_def.HasField("save_slice_info_def"):
       self._save_slice_info = variables.Variable.SaveSliceInfo(
           save_slice_info_def=variable_def.save_slice_info_def,
@@ -535,8 +543,6 @@ class ResourceVariable(variables.Variable):
       self._save_slice_info = None
     self._caching_device = None
     self._dtype = dtypes.as_dtype(self._handle.op.get_attr("dtype"))
-    self._graph_element = g.get_tensor_by_name(
-        self._handle.op.name + "/Read/ReadVariableOp:0")
     self._constraint = None
     self._cached_shape_as_list = None
 
@@ -745,6 +751,10 @@ class ResourceVariable(variables.Variable):
       if self._cached_value is not None:
         var_def.snapshot_name = ops.strip_name_scope(self._cached_value.name,
                                                      export_scope)
+      else:
+        # Store the graph_element here
+        var_def.snapshot_name = ops.strip_name_scope(self._graph_element.name,
+                                                     export_scope)
       var_def.is_resource = True
       if self._save_slice_info:
         var_def.save_slice_info_def.MergeFrom(
@@ -910,7 +920,6 @@ class ResourceVariable(variables.Variable):
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
     if dtype is not None and dtype != self.dtype:
-      print("trying to switch the dtype to ", dtype, " from ", self.dtype)
       return NotImplemented
     if as_ref:
       return self.read_value().op.inputs[0]
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index fe380c44dafdad6dc25d50102bacba610132674d..54f4e0f2407393e1a617633d886b43ab59cade29 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -599,9 +599,9 @@ class BasicLSTMCell(LayerRNNCell):
     Args:
       inputs: `2-D` tensor with shape `[batch_size, input_size]`.
       state: An `LSTMStateTuple` of state tensors, each shaped
-        `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
+        `[batch_size, num_units]`, if `state_is_tuple` has been set to
         `True`.  Otherwise, a `Tensor` shaped
-        `[batch_size, 2 * self.state_size]`.
+        `[batch_size, 2 * num_units]`.
 
     Returns:
       A pair containing the new hidden state, and the new state (either a
@@ -1206,7 +1206,16 @@ class DeviceWrapper(RNNCell):
 
 @tf_export("nn.rnn_cell.MultiRNNCell")
 class MultiRNNCell(RNNCell):
-  """RNN cell composed sequentially of multiple simple cells."""
+  """RNN cell composed sequentially of multiple simple cells.
+
+  Example:
+
+  ```python
+  num_units = [128, 64]
+  cells = [BasicLSTMCell(num_units=n) for n in num_units]
+  stacked_rnn_cell = MultiRNNCell(cells)
+  ```
+  """
 
   def __init__(self, cells, state_is_tuple=True):
     """Create a RNN cell composed sequentially of a number of RNNCells.
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/python/ops/summary_ops_v2.py
similarity index 90%
rename from tensorflow/contrib/summary/summary_ops.py
rename to tensorflow/python/ops/summary_ops_v2.py
index bc763fe655edc455e2538e536d6efab314c8228c..12f361c513fcebf8ce4b9c367d101b11ab10260b 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -31,7 +31,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.layers import utils
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_summary_ops
@@ -108,8 +108,10 @@ class SummaryWriter(object):
   - @{tf.contrib.summary.create_db_writer}
   """
 
-  def  __init__(self, resource):
+  def  __init__(self, resource, init_op_fn):
     self._resource = resource
+    # TODO(nickfelt): cache constructed ops in graph mode
+    self._init_op_fn = init_op_fn
     if context.executing_eagerly() and self._resource is not None:
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           handle=self._resource, handle_device="cpu:0")
@@ -129,10 +131,32 @@ class SummaryWriter(object):
       yield self
       # Flushes the summary writer in eager mode or in graph functions, but not
       # in legacy graph mode (you're on your own there).
-      with ops.device("cpu:0"):
-        gen_summary_ops.flush_summary_writer(self._resource)
+      self.flush()
       context.context().summary_writer_resource = old
 
+  def init(self):
+    """Operation to initialize the summary writer resource."""
+    if self._resource is not None:
+      return self._init_op_fn()
+
+  def _flush(self):
+    return _flush_fn(writer=self)
+
+  def flush(self):
+    """Operation to force the summary writer to flush any buffered data."""
+    if self._resource is not None:
+      return self._flush()
+
+  def _close(self):
+    with ops.control_dependencies([self.flush()]):
+      with ops.device("cpu:0"):
+        return gen_summary_ops.close_summary_writer(self._resource)
+
+  def close(self):
+    """Operation to flush and close the summary writer resource."""
+    if self._resource is not None:
+      return self._close()
+
 
 def initialize(
     graph=None,  # pylint: disable=redefined-outer-name
@@ -178,7 +202,7 @@ def create_file_writer(logdir,
                        flush_millis=None,
                        filename_suffix=None,
                        name=None):
-  """Creates a summary file writer in the current context.
+  """Creates a summary file writer in the current context under the given name.
 
   Args:
     logdir: a string, or None. If a string, creates a summary file writer
@@ -186,18 +210,20 @@ def create_file_writer(logdir,
      a mock object which acts like a summary writer but does nothing,
      useful to use as a context manager.
     max_queue: the largest number of summaries to keep in a queue; will
-     flush once the queue gets bigger than this.
-    flush_millis: the largest interval between flushes.
-    filename_suffix: optional suffix for the event file name.
+     flush once the queue gets bigger than this. Defaults to 10.
+    flush_millis: the largest interval between flushes. Defaults to 120,000.
+    filename_suffix: optional suffix for the event file name. Defaults to `.v2`.
     name: Shared name for this SummaryWriter resource stored to default
-      Graph.
+      Graph. Defaults to the provided logdir prefixed with `logdir:`. Note: if a
+      summary writer resource with this shared name already exists, the returned
+      SummaryWriter wraps that resource and the other arguments have no effect.
 
   Returns:
     Either a summary writer or an empty object which can be used as a
     summary writer.
   """
   if logdir is None:
-    return SummaryWriter(None)
+    return SummaryWriter(None, None)
   with ops.device("cpu:0"):
     if max_queue is None:
       max_queue = constant_op.constant(10)
@@ -205,6 +231,8 @@ def create_file_writer(logdir,
       flush_millis = constant_op.constant(2 * 60 * 1000)
     if filename_suffix is None:
       filename_suffix = constant_op.constant(".v2")
+    if name is None:
+      name = "logdir:" + logdir
     return _make_summary_writer(
         name,
         gen_summary_ops.create_summary_file_writer,
@@ -267,13 +295,12 @@ def create_db_writer(db_uri,
 
 def _make_summary_writer(name, factory, **kwargs):
   resource = gen_summary_ops.summary_writer(shared_name=name)
+  init_op_fn = lambda: factory(resource, **kwargs)
   # TODO(apassos): Consider doing this instead.
-  # node = factory(resource, **kwargs)
   # if not context.executing_eagerly():
-  #   ops.get_default_session().run(node)
-  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME,
-                        factory(resource, **kwargs))
-  return SummaryWriter(resource)
+  #   ops.get_default_session().run(init_op)
+  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME, init_op_fn())
+  return SummaryWriter(resource, init_op_fn)
 
 
 def _cleanse_string(name, pattern, value):
@@ -341,7 +368,7 @@ def summary_writer_function(name, tensor, function, family=None):
   if context.context().summary_writer_resource is None:
     return control_flow_ops.no_op()
   with ops.device("cpu:0"):
-    op = utils.smart_cond(
+    op = smart_cond.smart_cond(
         should_record_summaries(), record, _nothing, name="")
     ops.add_to_collection(ops.GraphKeys._SUMMARY_COLLECTION, op)  # pylint: disable=protected-access
   return op
@@ -538,7 +565,14 @@ def flush(writer=None, name=None):
     writer = context.context().summary_writer_resource
     if writer is None:
       return control_flow_ops.no_op()
-  return gen_summary_ops.flush_summary_writer(writer, name=name)
+  else:
+    if isinstance(writer, SummaryWriter):
+      writer = writer._resource  # pylint: disable=protected-access
+  with ops.device("cpu:0"):
+    return gen_summary_ops.flush_summary_writer(writer, name=name)
+
+
+_flush_fn = flush  # for within SummaryWriter.flush()
 
 
 def eval_dir(model_dir, name=None):
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index e33085ba626a7645be64941dd4da8e6943292e7e..ba213ef884165f7f72094d27932913e39c9a5901 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -307,6 +307,17 @@ class _VariableStore(object):
       raise ValueError(
           "Passed a custom_getter which is not callable: %s" % custom_getter)
 
+    with ops.init_scope():
+      if context.executing_eagerly():
+        # Variable creation and initialization takes place in `init_scope`s;
+        # as such, if an `init_scope` lifts us into the eager context, then we
+        # need to use `ResourceVariable`s.
+        use_resource = True
+
+    # Note that it's fine to reuse eager variables whose initialization was
+    # lifted from a function-building graph into the eager context (that's why
+    # the following clause is not wrapped in an `init_scope`); lifted variables
+    # are tracked by the graph's `VariableStore`.
     if context.executing_eagerly():
       if not self._store_eager_variables and reuse:
         raise RuntimeError(
@@ -315,7 +326,6 @@ class _VariableStore(object):
             " EagerVariableStore for example usage.")
       if self._store_eager_variables:
         reuse = AUTO_REUSE
-      use_resource = True
 
     # If a *_ref type is passed in an error would be triggered further down the
     # stack. We prevent this using base_dtype to get a non-ref version of the
diff --git a/tensorflow/python/summary/writer/event_file_writer_v2.py b/tensorflow/python/summary/writer/event_file_writer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c66c0f7a8fd4c65d3539d257b4e4fa89f839a98
--- /dev/null
+++ b/tensorflow/python/summary/writer/event_file_writer_v2.py
@@ -0,0 +1,140 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Writes events to disk in a logdir."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2
+from tensorflow.python.platform import gfile
+
+
+class EventFileWriterV2(object):
+  """Writes `Event` protocol buffers to an event file via the graph.
+
+  The `EventFileWriterV2` class is backed by the summary file writer in the v2
+  summary API (currently in tf.contrib.summary), so it uses a shared summary
+  writer resource and graph ops to write events.
+
+  As with the original EventFileWriter, this class will asynchronously write
+  Event protocol buffers to the backing file. The Event file is encoded using
+  the tfrecord format, which is similar to RecordIO.
+  """
+
+  def __init__(self, session, logdir, max_queue=10, flush_secs=120,
+               filename_suffix=''):
+    """Creates an `EventFileWriterV2` and an event file to write to.
+
+    On construction, this calls `tf.contrib.summary.create_file_writer` within
+    the graph from `session.graph` to look up a shared summary writer resource
+    for `logdir` if one exists, and create one if not. Creating the summary
+    writer resource in turn creates a new event file in `logdir` to be filled
+    with `Event` protocol buffers passed to `add_event`. Graph ops to control
+    this writer resource are added to `session.graph` during this init call;
+    stateful methods on this class will call `session.run()` on these ops.
+
+    Note that because the underlying resource is shared, it is possible that
+    other parts of the code using the same session may interact independently
+    with the resource, e.g. by flushing or even closing it. It is the caller's
+    responsibility to avoid any undesirable sharing in this regard.
+
+    The remaining arguments to the constructor (`flush_secs`, `max_queue`, and
+    `filename_suffix`) control the construction of the shared writer resource
+    if one is created. If an existing resource is reused, these arguments have
+    no effect.  See `tf.contrib.summary.create_file_writer` for details.
+
+    Args:
+      session: A `tf.Session`. Session that will hold shared writer resource.
+        The writer ops will be added to session.graph during this init call.
+      logdir: A string. Directory where event file will be written.
+      max_queue: Integer. Size of the queue for pending events and summaries.
+      flush_secs: Number. How often, in seconds, to flush the
+        pending events and summaries to disk.
+      filename_suffix: A string. Every event file's name is suffixed with
+        `filename_suffix`.
+    """
+    self._session = session
+    self._logdir = logdir
+    self._closed = False
+    if not gfile.IsDirectory(self._logdir):
+      gfile.MakeDirs(self._logdir)
+
+    with self._session.graph.as_default():
+      with ops.name_scope('filewriter'):
+        file_writer = summary_ops_v2.create_file_writer(
+            logdir=self._logdir,
+            max_queue=max_queue,
+            flush_millis=flush_secs * 1000,
+            filename_suffix=filename_suffix)
+        with summary_ops_v2.always_record_summaries(), file_writer.as_default():
+          self._event_placeholder = array_ops.placeholder_with_default(
+              constant_op.constant('unused', dtypes.string),
+              shape=[])
+          self._add_event_op = summary_ops_v2.import_event(
+              self._event_placeholder)
+        self._init_op = file_writer.init()
+        self._flush_op = file_writer.flush()
+        self._close_op = file_writer.close()
+      self._session.run(self._init_op)
+
+  def get_logdir(self):
+    """Returns the directory where event file will be written."""
+    return self._logdir
+
+  def reopen(self):
+    """Reopens the EventFileWriter.
+
+    Can be called after `close()` to add more events in the same directory.
+    The events will go into a new events file.
+
+    Does nothing if the EventFileWriter was not closed.
+    """
+    if self._closed:
+      self._closed = False
+      self._session.run(self._init_op)
+
+  def add_event(self, event):
+    """Adds an event to the event file.
+
+    Args:
+      event: An `Event` protocol buffer.
+    """
+    if not self._closed:
+      event_pb = event.SerializeToString()
+      self._session.run(
+          self._add_event_op, feed_dict={self._event_placeholder: event_pb})
+
+  def flush(self):
+    """Flushes the event file to disk.
+
+    Call this method to make sure that all pending events have been written to
+    disk.
+    """
+    self._session.run(self._flush_op)
+
+  def close(self):
+    """Flushes the event file to disk and close the file.
+
+    Call this method when you do not need the summary writer anymore.
+    """
+    if not self._closed:
+      self.flush()
+      self._session.run(self._close_op)
+      self._closed = True
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 57f78c156b1334a5486b29f2ddec957e49156e73..aca084fc9168e710316e4c988594cff69e54ebab 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import plugin_asset
 from tensorflow.python.summary.writer.event_file_writer import EventFileWriter
+from tensorflow.python.summary.writer.event_file_writer_v2 import EventFileWriterV2
 from tensorflow.python.util.tf_export import tf_export
 
 _PLUGINS_DIR = "plugins"
@@ -286,6 +287,11 @@ class FileWriter(SummaryToEventTransformer):
   file contents asynchronously. This allows a training program to call methods
   to add data to the file directly from the training loop, without slowing down
   training.
+
+  When constructed with a `tf.Session` parameter, a `FileWriter` instead forms
+  a compatibility layer over new graph-based summaries (`tf.contrib.summary`)
+  to facilitate the use of new summary writing with pre-existing code that
+  expects a `FileWriter` instance.
   """
 
   def __init__(self,
@@ -294,10 +300,11 @@ class FileWriter(SummaryToEventTransformer):
                max_queue=10,
                flush_secs=120,
                graph_def=None,
-               filename_suffix=None):
-    """Creates a `FileWriter` and an event file.
+               filename_suffix=None,
+               session=None):
+    """Creates a `FileWriter`, optionally shared within the given session.
 
-    On construction the summary writer creates a new event file in `logdir`.
+    Typically, constructing a file writer creates a new event file in `logdir`.
     This event file will contain `Event` protocol buffers constructed when you
     call one of the following functions: `add_summary()`, `add_session_log()`,
     `add_event()`, or `add_graph()`.
@@ -317,13 +324,16 @@ class FileWriter(SummaryToEventTransformer):
     writer = tf.summary.FileWriter(<some-directory>, sess.graph)
     ```
 
-    The other arguments to the constructor control the asynchronous writes to
-    the event file:
-
-    *  `flush_secs`: How often, in seconds, to flush the added summaries
-       and events to disk.
-    *  `max_queue`: Maximum number of summaries or events pending to be
-       written to disk before one of the 'add' calls block.
+    The `session` argument to the constructor makes the returned `FileWriter` a
+    a compatibility layer over new graph-based summaries (`tf.contrib.summary`).
+    Crucially, this means the underlying writer resource and events file will
+    be shared with any other `FileWriter` using the same `session` and `logdir`,
+    and with any `tf.contrib.summary.SummaryWriter` in this session using the
+    the same shared resource name (which by default scoped to the logdir). If
+    no such resource exists, one will be created using the remaining arguments
+    to this constructor, but if one already exists those arguments are ignored.
+    In either case, ops will be added to `session.graph` to control the
+    underlying file writer resource. See `tf.contrib.summary` for more details.
 
     Args:
       logdir: A string. Directory where event file will be written.
@@ -334,6 +344,7 @@ class FileWriter(SummaryToEventTransformer):
       graph_def: DEPRECATED: Use the `graph` argument instead.
       filename_suffix: A string. Every event file's name is suffixed with
         `suffix`.
+      session: A `tf.Session` object. See details above.
 
     Raises:
       RuntimeError: If called with eager execution enabled.
@@ -347,9 +358,12 @@ class FileWriter(SummaryToEventTransformer):
       raise RuntimeError(
           "tf.summary.FileWriter is not compatible with eager execution. "
           "Use tf.contrib.summary instead.")
-
-    event_writer = EventFileWriter(logdir, max_queue, flush_secs,
-                                   filename_suffix)
+    if session is not None:
+      event_writer = EventFileWriterV2(
+          session, logdir, max_queue, flush_secs, filename_suffix)
+    else:
+      event_writer = EventFileWriter(logdir, max_queue, flush_secs,
+                                     filename_suffix)
     super(FileWriter, self).__init__(event_writer, graph, graph_def)
 
   def __enter__(self):
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 88ade0aac33f1cd8f9d8cb30344aabca76a13511..dc990c2602427049ecdb7588ff217207a69cbcd2 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -29,10 +29,12 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import plugin_asset
@@ -42,7 +44,10 @@ from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.util import compat
 
 
-class SummaryWriterTestCase(test.TestCase):
+class FileWriterTestCase(test.TestCase):
+
+  def _FileWriter(self, *args, **kwargs):
+    return writer.FileWriter(*args, **kwargs)
 
   def _TestDir(self, test_name):
     test_dir = os.path.join(self.get_temp_dir(), test_name)
@@ -96,7 +101,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testAddingSummaryGraphAndRunMetadata(self):
     test_dir = self._CleanTestDir("basics")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
 
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     sw.add_summary(
@@ -171,7 +176,7 @@ class SummaryWriterTestCase(test.TestCase):
     test_dir = self._CleanTestDir("basics_named_graph")
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
-    sw = writer.FileWriter(test_dir, graph=g)
+    sw = self._FileWriter(test_dir, graph=g)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
@@ -179,7 +184,7 @@ class SummaryWriterTestCase(test.TestCase):
     test_dir = self._CleanTestDir("basics_positional_graph")
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
-    sw = writer.FileWriter(test_dir, g)
+    sw = self._FileWriter(test_dir, g)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
@@ -188,7 +193,7 @@ class SummaryWriterTestCase(test.TestCase):
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
     gd = g.as_graph_def()
-    sw = writer.FileWriter(test_dir, graph_def=gd)
+    sw = self._FileWriter(test_dir, graph_def=gd)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
@@ -197,7 +202,7 @@ class SummaryWriterTestCase(test.TestCase):
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
     gd = g.as_graph_def()
-    sw = writer.FileWriter(test_dir, gd)
+    sw = self._FileWriter(test_dir, gd)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
@@ -207,18 +212,18 @@ class SummaryWriterTestCase(test.TestCase):
       with ops.Graph().as_default() as g:
         constant_op.constant([12], name="douze")
       gd = g.as_graph_def()
-      sw = writer.FileWriter(test_dir, graph=g, graph_def=gd)
+      sw = self._FileWriter(test_dir, graph=g, graph_def=gd)
       sw.close()
 
   def testNeitherGraphNorGraphDef(self):
     with self.assertRaises(TypeError):
       test_dir = self._CleanTestDir("basics_string_instead_of_graph")
-      sw = writer.FileWriter(test_dir, "string instead of graph object")
+      sw = self._FileWriter(test_dir, "string instead of graph object")
       sw.close()
 
   def testCloseAndReopen(self):
     test_dir = self._CleanTestDir("close_and_reopen")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     sw.close()
     # Sleep at least one second to make sure we get a new event file name.
@@ -261,7 +266,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testNonBlockingClose(self):
     test_dir = self._CleanTestDir("non_blocking_close")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     # Sleep 1.2 seconds to make sure event queue is empty.
     time.sleep(1.2)
     time_before_close = time.time()
@@ -270,7 +275,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testWithStatement(self):
     test_dir = self._CleanTestDir("with_statement")
-    with writer.FileWriter(test_dir) as sw:
+    with self._FileWriter(test_dir) as sw:
       sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     event_paths = sorted(glob.glob(os.path.join(test_dir, "event*")))
     self.assertEquals(1, len(event_paths))
@@ -280,7 +285,7 @@ class SummaryWriterTestCase(test.TestCase):
   # protocol buffers correctly.
   def testAddingSummariesFromSessionRunCalls(self):
     test_dir = self._CleanTestDir("global_step")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     with self.test_session():
       i = constant_op.constant(1, dtype=dtypes.int32, shape=[])
       l = constant_op.constant(2, dtype=dtypes.int64, shape=[])
@@ -327,7 +332,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testPluginMetadataStrippedFromSubsequentEvents(self):
     test_dir = self._CleanTestDir("basics")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
 
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
 
@@ -386,7 +391,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testFileWriterWithSuffix(self):
     test_dir = self._CleanTestDir("test_suffix")
-    sw = writer.FileWriter(test_dir, filename_suffix="_test_suffix")
+    sw = self._FileWriter(test_dir, filename_suffix="_test_suffix")
     for _ in range(10):
       sw.add_summary(
           summary_pb2.Summary(value=[
@@ -400,9 +405,178 @@ class SummaryWriterTestCase(test.TestCase):
     for filename in event_filenames:
       self.assertTrue(filename.endswith("_test_suffix"))
 
+  def testPluginAssetSerialized(self):
+    class ExamplePluginAsset(plugin_asset.PluginAsset):
+      plugin_name = "example"
+
+      def assets(self):
+        return {"foo.txt": "foo!", "bar.txt": "bar!"}
+
+    with ops.Graph().as_default() as g:
+      plugin_asset.get_plugin_asset(ExamplePluginAsset)
+
+      logdir = self.get_temp_dir()
+      fw = self._FileWriter(logdir)
+      fw.add_graph(g)
+    plugin_dir = os.path.join(logdir, writer._PLUGINS_DIR, "example")
+
+    with gfile.Open(os.path.join(plugin_dir, "foo.txt"), "r") as f:
+      content = f.read()
+    self.assertEqual(content, "foo!")
+
+    with gfile.Open(os.path.join(plugin_dir, "bar.txt"), "r") as f:
+      content = f.read()
+    self.assertEqual(content, "bar!")
 
-class SummaryWriterCacheTest(test.TestCase):
-  """SummaryWriterCache tests."""
+
+class SessionBasedFileWriterTestCase(FileWriterTestCase):
+  """Tests for FileWriter behavior when passed a Session argument."""
+
+  def _FileWriter(self, *args, **kwargs):
+    if "session" not in kwargs:
+      # Pass in test_session() as the session. It will be cached during this
+      # test method invocation so that any other use of test_session() with no
+      # graph should result in re-using the same underlying Session.
+      with self.test_session() as sess:
+        kwargs["session"] = sess
+        return writer.FileWriter(*args, **kwargs)
+    return writer.FileWriter(*args, **kwargs)
+
+  def _createTaggedSummary(self, tag):
+    summary = summary_pb2.Summary()
+    summary.value.add(tag=tag)
+    return summary
+
+  def testSharing_withOtherSessionBasedFileWriters(self):
+    logdir = self.get_temp_dir()
+    with session.Session() as sess:
+      # Initial file writer
+      writer1 = writer.FileWriter(session=sess, logdir=logdir)
+      writer1.add_summary(self._createTaggedSummary("one"), 1)
+      writer1.flush()
+
+      # File writer, should share file with writer1
+      writer2 = writer.FileWriter(session=sess, logdir=logdir)
+      writer2.add_summary(self._createTaggedSummary("two"), 2)
+      writer2.flush()
+
+      # File writer with different logdir (shouldn't be in this logdir at all)
+      writer3 = writer.FileWriter(session=sess, logdir=logdir + "-other")
+      writer3.add_summary(self._createTaggedSummary("three"), 3)
+      writer3.flush()
+
+      # File writer in a different session (should be in separate file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      with session.Session() as other_sess:
+        writer4 = writer.FileWriter(session=other_sess, logdir=logdir)
+        writer4.add_summary(self._createTaggedSummary("four"), 4)
+        writer4.flush()
+
+      # One more file writer, should share file with writer1
+      writer5 = writer.FileWriter(session=sess, logdir=logdir)
+      writer5.add_summary(self._createTaggedSummary("five"), 5)
+      writer5.flush()
+
+    event_paths = iter(sorted(glob.glob(os.path.join(logdir, "event*"))))
+
+    # First file should have tags "one", "two", and "five"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("one", next(events).summary.value[0].tag)
+    self.assertEqual("two", next(events).summary.value[0].tag)
+    self.assertEqual("five", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file should have just "four"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("four", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_paths))
+
+    # Just check that the other logdir file exists to be sure we wrote it
+    self.assertTrue(glob.glob(os.path.join(logdir + "-other", "event*")))
+
+  def testSharing_withExplicitSummaryFileWriters(self):
+    logdir = self.get_temp_dir()
+    with session.Session() as sess:
+      # Initial file writer via FileWriter(session=?)
+      writer1 = writer.FileWriter(session=sess, logdir=logdir)
+      writer1.add_summary(self._createTaggedSummary("one"), 1)
+      writer1.flush()
+
+      # Next one via create_file_writer(), should use same file
+      writer2 = summary_ops_v2.create_file_writer(logdir=logdir)
+      with summary_ops_v2.always_record_summaries(), writer2.as_default():
+        summary2 = summary_ops_v2.scalar("two", 2.0, step=2)
+      sess.run(writer2.init())
+      sess.run(summary2)
+      sess.run(writer2.flush())
+
+      # Next has different shared name, should be in separate file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops_v2.create_file_writer(logdir=logdir, name="other")
+      with summary_ops_v2.always_record_summaries(), writer3.as_default():
+        summary3 = summary_ops_v2.scalar("three", 3.0, step=3)
+      sess.run(writer3.init())
+      sess.run(summary3)
+      sess.run(writer3.flush())
+
+      # Next uses a second session, should be in separate file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      with session.Session() as other_sess:
+        writer4 = summary_ops_v2.create_file_writer(logdir=logdir)
+        with summary_ops_v2.always_record_summaries(), writer4.as_default():
+          summary4 = summary_ops_v2.scalar("four", 4.0, step=4)
+        other_sess.run(writer4.init())
+        other_sess.run(summary4)
+        other_sess.run(writer4.flush())
+
+        # Next via FileWriter(session=?) uses same second session, should be in
+        # same separate file. (This checks sharing in the other direction)
+        writer5 = writer.FileWriter(session=other_sess, logdir=logdir)
+        writer5.add_summary(self._createTaggedSummary("five"), 5)
+        writer5.flush()
+
+      # One more via create_file_writer(), should use same file
+      writer6 = summary_ops_v2.create_file_writer(logdir=logdir)
+      with summary_ops_v2.always_record_summaries(), writer6.as_default():
+        summary6 = summary_ops_v2.scalar("six", 6.0, step=6)
+      sess.run(writer6.init())
+      sess.run(summary6)
+      sess.run(writer6.flush())
+
+    event_paths = iter(sorted(glob.glob(os.path.join(logdir, "event*"))))
+
+    # First file should have tags "one", "two", and "six"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("one", next(events).summary.value[0].tag)
+    self.assertEqual("two", next(events).summary.value[0].tag)
+    self.assertEqual("six", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file should have just "three"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("three", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Third file should have "four" and "five"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("four", next(events).summary.value[0].tag)
+    self.assertEqual("five", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_paths))
+
+
+class FileWriterCacheTest(test.TestCase):
+  """FileWriterCache tests."""
 
   def _test_dir(self, test_name):
     """Create an empty dir to use for tests.
@@ -448,32 +622,5 @@ class SummaryWriterCacheTest(test.TestCase):
       self.assertFalse(sw1 == sw2)
 
 
-class ExamplePluginAsset(plugin_asset.PluginAsset):
-  plugin_name = "example"
-
-  def assets(self):
-    return {"foo.txt": "foo!", "bar.txt": "bar!"}
-
-
-class PluginAssetsTest(test.TestCase):
-
-  def testPluginAssetSerialized(self):
-    with ops.Graph().as_default() as g:
-      plugin_asset.get_plugin_asset(ExamplePluginAsset)
-
-      logdir = self.get_temp_dir()
-      fw = writer.FileWriter(logdir)
-      fw.add_graph(g)
-    plugin_dir = os.path.join(logdir, writer._PLUGINS_DIR, "example")
-
-    with gfile.Open(os.path.join(plugin_dir, "foo.txt"), "r") as f:
-      content = f.read()
-    self.assertEqual(content, "foo!")
-
-    with gfile.Open(os.path.join(plugin_dir, "bar.txt"), "r") as f:
-      content = f.read()
-    self.assertEqual(content, "bar!")
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 6e39ce8c808a1716ff9263982e99a14592472c76..84d20f8e3628ed28fc408e9100b2999973911a33 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -28,7 +28,7 @@ py_library(
     name = "saved_model_utils",
     srcs = ["saved_model_utils.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
+    deps = ["//tensorflow/contrib/saved_model:reader"],
 )
 
 py_library(
@@ -41,8 +41,10 @@ py_library(
         "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:client",
         "//tensorflow/python:framework",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
+        "//tensorflow/python/saved_model:loader",
         "@six_archive//:six",
     ],
 )
@@ -52,14 +54,7 @@ py_binary(
     srcs = ["freeze_graph.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":saved_model_utils",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:training",
-        "@six_archive//:six",
+        ":freeze_graph_lib",
     ],
 )
 
diff --git a/tensorflow/python/training/checkpointable.py b/tensorflow/python/training/checkpointable.py
index bbbe1e8ac5b985b11f2481ddcadedc06ed70a4fb..0b8473742c1b46c49d66a0bdbbd329aff58201ab 100644
--- a/tensorflow/python/training/checkpointable.py
+++ b/tensorflow/python/training/checkpointable.py
@@ -26,6 +26,11 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.util import nest
 
+
+# Key where the object graph proto is saved in a TensorBundle
+OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
+
+
 # A key indicating a variable's value in an object's checkpointed Tensors
 # (Checkpointable._gather_saveables_for_checkpoint). If this is the only key and
 # the object has no dependencies, then its value may be restored on object
@@ -94,12 +99,13 @@ class _CheckpointPosition(object):
 
   def restore(self, checkpointable):
     """Restore this value into `checkpointable`."""
-    if self.bind_object(checkpointable):
-      # This object's correspondence with a checkpointed object is new, so
-      # process deferred restorations for it and its dependencies.
-      restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
-      if restore_ops:
-        self._checkpoint.restore_ops.extend(restore_ops)
+    with ops.init_scope():
+      if self.bind_object(checkpointable):
+        # This object's correspondence with a checkpointed object is new, so
+        # process deferred restorations for it and its dependencies.
+        restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
+        if restore_ops:
+          self._checkpoint.restore_ops.extend(restore_ops)
 
   def bind_object(self, checkpointable):
     """Set a checkpoint<->object correspondence and process slot variables.
@@ -409,28 +415,29 @@ class CheckpointableBase(object):
            "Checkpointable._add_variable called to create another with "
            "that name. Variable names must be unique within a Checkpointable "
            "object.") % (name,))
-    if context.executing_eagerly():
-      # If this is a variable with a single Tensor stored in the checkpoint, we
-      # can set that value as an initializer rather than initializing and then
-      # assigning (when executing eagerly). This call returns None if there is
-      # nothing to restore.
-      checkpoint_initializer = self._preload_simple_restoration(
-          name=name, shape=shape)
-    else:
-      checkpoint_initializer = None
-    if (checkpoint_initializer is not None
-        and not (
-            isinstance(initializer, CheckpointInitialValue)
-            and initializer.restore_uid > checkpoint_initializer.restore_uid)):
-      # If multiple Checkpointable objects are "creating" the same variable via
-      # the magic of custom getters, the one with the highest restore UID (the
-      # one called last) has to make the final initializer. If another custom
-      # getter interrupts this process by overwriting the initializer, then
-      # we'll catch that when we call _track_checkpointable. So this is "best
-      # effort" to set the initializer with the highest restore UID.
-      initializer = checkpoint_initializer
-      shape = None
-
+    with ops.init_scope():
+      if context.executing_eagerly():
+        # If this is a variable with a single Tensor stored in the checkpoint,
+        # we can set that value as an initializer rather than initializing and
+        # then assigning (when executing eagerly). This call returns None if
+        # there is nothing to restore.
+        checkpoint_initializer = self._preload_simple_restoration(
+            name=name, shape=shape)
+      else:
+        checkpoint_initializer = None
+      if (checkpoint_initializer is not None
+          and not (
+              isinstance(initializer, CheckpointInitialValue)
+              and (initializer.restore_uid
+                   > checkpoint_initializer.restore_uid))):
+        # If multiple Checkpointable objects are "creating" the same variable
+        # via the magic of custom getters, the one with the highest restore UID
+        # (the one called last) has to make the final initializer. If another
+        # custom getter interrupts this process by overwriting the initializer,
+        # then we'll catch that when we call _track_checkpointable. So this is
+        # "best effort" to set the initializer with the highest restore UID.
+        initializer = checkpoint_initializer
+        shape = None
     new_variable = getter(
         name=name, shape=shape, dtype=dtype, initializer=initializer,
         **kwargs_for_getter)
diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py
index 32123f87ef2d12497077ab0e2f7d4d4cad1ec5dd..2c4677a27835950c803775fc021c6791bc02fbfb 100644
--- a/tensorflow/python/training/checkpointable_utils.py
+++ b/tensorflow/python/training/checkpointable_utils.py
@@ -17,14 +17,46 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
+import collections
 import weakref
 
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
-from tensorflow.python.training import checkpointable
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import checkpointable as checkpointable_lib
+from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.util import deprecation
 
 
-class _Checkpoint(object):
+_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
+
+# Keyword for identifying that the next bit of a checkpoint variable name is a
+# slot name. Checkpoint names for slot variables look like:
+#
+#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
+#
+# Where <path to variable> is a full path from the checkpoint root to the
+# variable being slotted for.
+_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
+# Keyword for separating the path to an object from the name of an
+# attribute in checkpoint names. Used like:
+#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
+_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
+
+
+class _CheckpointRestoreCoordinator(object):
   """Holds the status of an object-based checkpoint load."""
 
   def __init__(self, object_graph_proto, save_path, dtype_map=None):
@@ -72,7 +104,819 @@ class _Checkpoint(object):
         # `node` refers to an `Optimizer`, since only these have slot variables.
         self.slot_restorations.setdefault(
             slot_reference.original_variable_node_id, []).append(
-                checkpointable._SlotVariableRestoration(  # pylint: disable=protected-access
+                checkpointable_lib._SlotVariableRestoration(  # pylint: disable=protected-access
                     optimizer_id=node_index,
                     slot_variable_id=slot_reference.slot_variable_node_id,
                     slot_name=slot_reference.slot_name))
+
+
+# TODO(allenl): If this ends up in a public API, consider adding LINT.IfChange
+# or consolidating the implementation with get_variable.
+def _default_getter(name, shape, dtype, initializer=None,
+                    partition_info=None, **kwargs):
+  """A pared-down version of get_variable which does not reuse variables."""
+  dtype = dtypes.as_dtype(dtype)
+  shape_object = tensor_shape.as_shape(shape)
+  with ops.init_scope():
+    if initializer is None:
+      initializer, initializing_from_value = (
+          variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
+              name=name, shape=shape_object, dtype=dtype))
+    else:
+      initializing_from_value = not callable(initializer)
+    # Same logic as get_variable
+    variable_dtype = dtype.base_dtype
+    if initializing_from_value:
+      if shape is not None:
+        raise ValueError("If initializer is a constant, do not specify shape.")
+      initial_value = initializer
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      def initial_value():
+        return initializer(
+            shape_object.as_list(), dtype=dtype, partition_info=partition_info)
+    return resource_variable_ops.ResourceVariable(
+        initial_value=initial_value,
+        name=name,
+        dtype=variable_dtype,
+        **kwargs
+    )
+
+
+def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
+                 initializer=None):
+  """Add a variable to a Checkpointable with no scope influence."""
+  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
+      name=name, shape=shape, dtype=dtype,
+      initializer=initializer, getter=_default_getter)
+
+
+def _breadth_first_checkpointable_traversal(root_checkpointable):
+  """Find shortest paths to all variables owned by dependencies of root."""
+  bfs_sorted = []
+  to_visit = collections.deque([root_checkpointable])
+  path_to_root = {root_checkpointable: ()}
+  while to_visit:
+    current_checkpointable = to_visit.popleft()
+    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    bfs_sorted.append(current_checkpointable)
+    for child_checkpointable in (
+        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
+      if child_checkpointable.ref not in path_to_root:
+        path_to_root[child_checkpointable.ref] = (
+            path_to_root[current_checkpointable] + (child_checkpointable,))
+        to_visit.append(child_checkpointable.ref)
+  return bfs_sorted, path_to_root
+
+
+def _escape_local_name(name):
+  # We need to support slashes in local names for compatibility, since this
+  # naming scheme is being patched in to things like Layer.add_variable where
+  # slashes were previously accepted. We also want to use slashes to indicate
+  # edges traversed to reach the variable, so we escape forward slashes in
+  # names.
+  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
+          .replace(r"/", _ESCAPE_CHAR + "S"))
+
+
+def _object_prefix_from_path(path_to_root):
+  return "/".join(
+      (_escape_local_name(checkpointable.name)
+       for checkpointable in path_to_root))
+
+
+def _slot_variable_naming_for_optimizer(optimizer_path):
+  """Make a function for naming slot variables in an optimizer."""
+  # Name slot variables:
+  #
+  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
+  #
+  # where <variable name> is exactly the checkpoint name used for the original
+  # variable, including the path from the checkpoint root and the local name in
+  # the object which owns it. Note that we only save slot variables if the
+  # variable it's slotting for is also being saved.
+
+  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
+
+  def _name_slot_variable(variable_path, slot_name):
+    """With an optimizer specified, name a slot variable."""
+    return (variable_path
+            + optimizer_identifier
+            + _escape_local_name(slot_name))
+
+  return _name_slot_variable
+
+
+def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
+  """Gather and name slot variables."""
+  non_slot_objects = list(checkpointable_objects)
+  slot_variables = {}
+  for checkpointable in non_slot_objects:
+    if isinstance(checkpointable, optimizer_lib.Optimizer):
+      naming_scheme = _slot_variable_naming_for_optimizer(
+          optimizer_path=object_names[checkpointable])
+      slot_names = checkpointable.get_slot_names()
+      for slot_name in slot_names:
+        for original_variable_node_id, original_variable in enumerate(
+            non_slot_objects):
+          try:
+            slot_variable = checkpointable.get_slot(
+                original_variable, slot_name)
+          except AttributeError:
+            slot_variable = None
+          if slot_variable is None:
+            continue
+          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
+            # TODO(allenl): Gather dependencies of slot variables.
+            raise NotImplementedError(
+                "Currently only variables with no dependencies can be saved as "
+                "slot variables. File a feature request if this limitation "
+                "bothers you.")
+          if slot_variable in node_ids:
+            raise NotImplementedError(
+                "A slot variable was re-used as a dependency of a "
+                "Checkpointable object. This is not currently allowed. File a "
+                "feature request if this limitation bothers you.")
+          checkpoint_name = naming_scheme(
+              variable_path=object_names[original_variable],
+              slot_name=slot_name)
+          object_names[slot_variable] = checkpoint_name
+          slot_variable_node_id = len(checkpointable_objects)
+          node_ids[slot_variable] = slot_variable_node_id
+          checkpointable_objects.append(slot_variable)
+          slot_variable_proto = (
+              checkpointable_object_graph_pb2.CheckpointableObjectGraph
+              .CheckpointableObject.SlotVariableReference(
+                  slot_name=slot_name,
+                  original_variable_node_id=original_variable_node_id,
+                  slot_variable_node_id=slot_variable_node_id))
+          slot_variables.setdefault(checkpointable, []).append(
+              slot_variable_proto)
+  return slot_variables
+
+
+def _serialize_checkpointables(
+    checkpointable_objects, node_ids, object_names, slot_variables):
+  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
+  object_graph_proto = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  named_saveables = {}
+
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    assert node_ids[checkpointable] == checkpoint_id
+    object_proto = object_graph_proto.nodes.add()
+    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
+    object_name = object_names[checkpointable]
+    for name, saveable_factory in (
+        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+      attribute = object_proto.attributes.add()
+      attribute.name = name
+      attribute.checkpoint_key = "%s/%s/%s" % (
+          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
+      if callable(saveable_factory):
+        saveable = saveable_factory(name=attribute.checkpoint_key)
+      else:
+        saveable = saveable_factory
+      # Figure out the name-based Saver's name for this variable.
+      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+          [saveable], convert_variable_to_tensor=False)
+      attribute.full_name, = saver_dict.keys()
+      named_saveables[attribute.checkpoint_key] = saveable
+
+    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
+      child_proto = object_proto.children.add()
+      child_proto.node_id = node_ids[child.ref]
+      child_proto.local_name = child.name
+
+  return named_saveables, object_graph_proto
+
+
+def _serialize_object_graph(root_checkpointable):
+  """Determine checkpoint keys for variables and build a serialized graph.
+
+  Non-slot variables are keyed based on a shortest path from the root saveable
+  to the object which owns the variable (i.e. the one which called
+  `Checkpointable._add_variable` to create it).
+
+  Slot variables are keyed based on a shortest path to the variable being
+  slotted for, a shortest path to their optimizer, and the slot name.
+
+  Args:
+    root_checkpointable: A `Checkpointable` object whose variables (including
+      the variables of dependencies, recursively) should be saved.
+
+  Returns:
+    A tuple of (named_variables, object_graph_proto):
+      named_variables: A dictionary mapping names to variable objects.
+      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
+        the serialized object graph and variable references.
+
+  Raises:
+    ValueError: If there are invalid characters in an optimizer's slot names.
+  """
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_names = {
+      obj: _object_prefix_from_path(path)
+      for obj, path in path_to_root.items()}
+  node_ids = {node: node_id for node_id, node
+              in enumerate(checkpointable_objects)}
+  slot_variables = _serialize_slot_variables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names)
+  return _serialize_checkpointables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names,
+      slot_variables=slot_variables)
+
+
+def gather_initializers(root_checkpointable):
+  """Traverse the object graph and find initialization ops.
+
+  Looks for `Checkpointable` objects which are dependencies of
+  `root_checkpointable` and which have an `initializer` property. Includes
+  initializers for slot variables only if the variable they are slotting for and
+  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
+  saved with a checkpoint).
+
+  Args:
+    root_checkpointable: A `Checkpointable` object to gather initializers for.
+  Returns:
+    A list of initialization ops.
+  """
+  # TODO(allenl): Extract out gathering logic so the naming logic doesn't have
+  # to run.
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_names = {
+      obj: _object_prefix_from_path(path)
+      for obj, path in path_to_root.items()}
+  node_ids = {node: node_id for node_id, node
+              in enumerate(checkpointable_objects)}
+  _serialize_slot_variables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names)
+  return [c.initializer for c in checkpointable_objects
+          if hasattr(c, "initializer") and c.initializer is not None]
+
+
+class _NoRestoreSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+
+  def __init__(self, tensor, name):
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(tensor, "", name)
+    super(_NoRestoreSaveable, self).__init__(tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    return control_flow_ops.no_op()
+
+
+class _LoadStatus(object):
+  """Abstract base for load status callbacks."""
+
+  @abc.abstractmethod
+  def assert_consumed(self):
+    """Raises an exception unless a non-trivial restoration has completed."""
+    pass
+
+  @abc.abstractmethod
+  def run_restore_ops(self, session=None):
+    """Runs restore ops from the checkpoint. Requires a valid checkpoint."""
+    pass
+
+  @abc.abstractmethod
+  def initialize_or_restore(self, session=None):
+    """Runs restore ops from the checkpoint, or initializes variables."""
+    pass
+
+
+class CheckpointLoadStatus(_LoadStatus):
+  """Checks the status of checkpoint loading and manages restore ops.
+
+  Returned from `Saver.restore`. Since `restore` may defer the loading of values
+  in the checkpoint which don't yet have corresponding Python objects,
+  `CheckpointLoadStatus` provides a callback to verify that checkpoint loading
+  is complete (`assert_consumed`).
+
+  When graph building, `restore` does not run restore ops itself since their
+  creation may be deferred. The `run_restore_ops` method must be called once all
+  Python objects with values to restore have been created and added to the
+  dependency graph (this does not necessarily have to be the whole checkpoint;
+  calling `run_restore_ops` while `assert_consumed` fails is supported and will
+  partially restore the checkpoint).
+
+  See `Saver.restore` for usage examples.
+  """
+
+  def __init__(self, checkpoint, feed_dict):
+    self._checkpoint = checkpoint
+    self._feed_dict = feed_dict
+
+  def assert_consumed(self):
+    """Asserts that all objects in the checkpoint have been created/matched.
+
+    Returns:
+      `self` for chaining.
+    Raises:
+      AssertionError: If there are any Python objects in the dependency graph
+        which have not been restored from this checkpoint or a later `restore`,
+        or if there are any checkpointed values which have not been matched to
+        Python objects.
+    """
+    for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
+      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if checkpointable is None:
+        raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
+      if checkpointable._update_uid < self._checkpoint.restore_uid:  # pylint: disable=protected-access
+        raise AssertionError(
+            "Object not assigned a value from checkpoint: %s" % (node,))
+    if self._checkpoint.slot_restorations:
+      # Sanity check; this collection should be clear if everything has been
+      # restored.
+      raise AssertionError("Unresolved slot restorations: %s" % (
+          self._checkpoint.slot_restorations,))
+    if self._checkpoint.unused_attributes:
+      raise AssertionError(
+          ("Unused attributes in these objects (the attributes exist in the "
+           "checkpoint but not in the objects): %s") % (
+               self._checkpoint.unused_attributes.items(),))
+    return self
+
+  def run_restore_ops(self, session=None):
+    """Run operations to restore objects in the dependency graph."""
+    if context.executing_eagerly():
+      return  # Run eagerly
+    if session is None:
+      session = ops.get_default_session()
+    session.run(self._checkpoint.restore_ops, feed_dict=self._feed_dict)
+
+  def initialize_or_restore(self, session=None):
+    """Alias for `run_restore_ops`.
+
+    This method has a sibling in `InitializationOnlyStatus` which instead
+    initializes variables. That type is returned if no checkpoint is specified
+    in `Saver.restore`.
+
+    Args:
+      session: The session to run restore ops in. If `None`, uses the default
+        session.
+    """
+    self.run_restore_ops(session=session)
+
+
+class InitializationOnlyStatus(_LoadStatus):
+  """Returned from `Saver.restore` when no checkpoint has been specified.
+
+  Objects of this type have the same `assert_consumed` method as
+  `CheckpointLoadStatus`, but it always fails. However,
+  `initialize_or_restore` works on objects of both types, and will
+  initialize variables in `InitializationOnlyStatus` objects or restore them
+  otherwise.
+  """
+
+  def __init__(self, root_checkpointable):
+    self._root_checkpointable = root_checkpointable
+
+  def assert_consumed(self):
+    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
+    raise AssertionError(
+        "No checkpoint specified (save_path=None); nothing is being restored.")
+
+  def run_restore_ops(self, session=None):
+    """For consistency with `CheckpointLoadStatus`.
+
+    Use `initialize_or_restore` for initializing if no checkpoint was passed
+    to `Saver.restore` and restoring otherwise.
+
+    Args:
+      session: Not used.
+    """
+    raise AssertionError(
+        "No checkpoint specified, so no restore ops are available "
+        "(save_path=None to Saver.restore).")
+
+  def initialize_or_restore(self, session=None):
+    """Runs initialization ops for variables.
+
+    Only objects which would be saved by `Saver.save` will be initialized. See
+    `gather_initializers` for details.
+
+    This method does nothing when executing eagerly (initializers get run
+    eagerly).
+
+    Args:
+      session: The session to run initialization ops in. If `None`, uses the
+        default session.
+    """
+    if context.executing_eagerly():
+      return  # run eagerly
+    if session is None:
+      session = ops.get_default_session()
+    session.run(gather_initializers(self._root_checkpointable))
+
+
+_DEPRECATED_RESTORE_INSTRUCTIONS = (
+    "Restoring a name-based tf.train.Saver checkpoint using the object-based "
+    "restore API. This mode uses global names to match variables, and so is "
+    "somewhat fragile. It also adds new restore ops to the graph each time it "
+    "is called. Prefer re-encoding training checkpoints in the object-based "
+    "format: run save() on the object-based saver (the same one this message "
+    "is coming from) and use that checkpoint in the future.")
+
+
+class NameBasedSaverStatus(_LoadStatus):
+  """Status for loading a name-based training checkpoint."""
+
+  def __init__(self, object_saver, save_path):
+    self._object_saver = object_saver
+    self._save_path = save_path
+
+  def assert_consumed(self):
+    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
+    raise AssertionError(
+        "Restoring a name-based checkpoint. No load status is available.")
+
+  @deprecation.deprecated(
+      date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
+  def run_restore_ops(self, session=None):
+    """Load the name-based training checkpoint using a new `tf.train.Saver`."""
+    if session is None and not context.executing_eagerly():
+      session = ops.get_default_session()
+    with ops.device("/cpu:0"):
+      saver_lib.Saver(self._object_saver._global_variable_names()).restore(  # pylint: disable=protected-access
+          sess=session, save_path=self._save_path)
+
+  def initialize_or_restore(self, session=None):
+    """Alias for `run_restore_ops`."""
+    self.run_restore_ops(session=session)
+
+
+class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
+  """Pretends to be a session, inserts extra feeds on run()."""
+
+  def __init__(self, session, feed_additions):
+    self._wrapped_session = session
+    self._feed_additions = feed_additions
+
+  def run(self, fetches, feed_dict=None, **kwargs):
+    if feed_dict is None:
+      feed_dict = {}
+    else:
+      feed_dict = feed_dict.copy()
+    feed_dict.update(self._feed_additions)
+    return self._wrapped_session.run(
+        fetches=fetches, feed_dict=feed_dict, **kwargs)
+
+
+def _copy_saver_with_new_var_list(old_saver, new_var_list):
+  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
+  new_saver = saver_lib.Saver(var_list=new_var_list)
+  # TODO(allenl): Move to copying functionality to Saver?
+  # pylint: disable=protected-access
+  new_saver._last_checkpoints = old_saver._last_checkpoints
+  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
+  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
+  # pylint: enable=protected-access
+  return new_saver
+
+
+class CheckpointableSaver(object):
+  """Saves and restores a `Checkpointable` object and its dependencies.
+
+  See `Checkpointable` for details of dependency management. `Saver` wraps
+  `tf.train.Saver` for saving, including extra information about the graph of
+  dependencies between Python objects. When restoring, it uses this information
+  about the save-time dependency graph to more robustly match objects with their
+  checkpointed values. When executing eagerly, it supports restoring variables
+  on object creation (see `Saver.restore`).
+
+  Values in a checkpoint are mapped to `Checkpointable` Python objects
+  (`Variable`s, `Optimizer`s, `Layer`s) based on the names provided when the
+  checkpoint was written. To avoid breaking existing checkpoints when modifying
+  a class, dependency names (the names of attributes to which `Checkpointable`
+  objects are assigned) may not change. These names are local to objects, in
+  contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
+  so allow additional program transformations.
+  """
+
+  def __init__(self, root_checkpointable):
+    """Configure saving.
+
+    Args:
+      root_checkpointable: The root of the object graph to save/restore. This
+        object and all of its dependencies are saved in the checkpoint. When
+        restoring, objects are matched and restored starting from this root.
+    """
+    # Allow passing in a weak reference to avoid reference cycles when
+    # `Checkpointable` objects save themselves.
+    self._root_checkpointable_ref = root_checkpointable
+    if not context.executing_eagerly():
+      with ops.device("/cpu:0"):
+        self._file_prefix_placeholder = constant_op.constant("model")
+    else:
+      self._file_prefix_placeholder = None
+
+    # Op caching for save
+    self._object_graph_feed_tensor = None
+    self._last_save_object_graph = None
+    self._last_save_saver = None
+
+    # Op caching for restore
+    self._last_restore_object_graph = None
+    self._last_restore_checkpoint = None
+
+  @property
+  def _root_checkpointable(self):
+    if isinstance(self._root_checkpointable_ref, weakref.ref):
+      derefed = self._root_checkpointable_ref()
+      assert derefed is not None
+      return derefed
+    else:
+      return self._root_checkpointable_ref
+
+  def save(self, file_prefix, checkpoint_number=None, session=None):
+    """Save a training checkpoint.
+
+    The saved checkpoint includes variables created by this object and any
+    Checkpointable objects it depends on at the time `Saver.save()` is called.
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix). Names are generated based on this
+        prefix and `checkpoint_number`, if provided.
+      checkpoint_number: An integer variable or Tensor, used to number
+        checkpoints. Typically this value is saved along with other variables in
+        training checkpoints, which will happen automatically if it was created
+        by `root_checkpointable` or one of its dependencies (via
+        `Checkpointable._add_variable`).
+      session: The session to evaluate variables in. Ignored when executing
+        eagerly. If not provided when graph building, the default session is
+        used.
+
+    Returns:
+      The full path to the checkpoint.
+    """
+    named_variables, graph_proto = _serialize_object_graph(
+        self._root_checkpointable)
+    if not context.executing_eagerly():
+      if session is None:
+        session = ops.get_default_session()
+      if self._object_graph_feed_tensor is None:
+        with ops.device("/cpu:0"):
+          self._object_graph_feed_tensor = constant_op.constant(
+              "", dtype=dtypes.string)
+      object_graph_tensor = self._object_graph_feed_tensor
+      feed_additions = {object_graph_tensor: graph_proto.SerializeToString()}
+    else:
+      session = None
+      with ops.device("/cpu:0"):
+        object_graph_tensor = constant_op.constant(
+            graph_proto.SerializeToString(), dtype=dtypes.string)
+      feed_additions = None
+    assert checkpointable_lib.OBJECT_GRAPH_PROTO_KEY not in named_variables
+    named_variables[checkpointable_lib.OBJECT_GRAPH_PROTO_KEY] = (
+        _NoRestoreSaveable(
+            tensor=object_graph_tensor,
+            name=checkpointable_lib.OBJECT_GRAPH_PROTO_KEY))
+    if (self._last_save_object_graph != graph_proto
+        # When executing eagerly, we need to re-create SaveableObjects each time
+        # save() is called so they pick up new Tensors passed to their
+        # constructors. That means the Saver needs to be copied with a new
+        # var_list.
+        or context.executing_eagerly()):
+      if self._last_save_object_graph is not None:
+        self._last_save_saver = _copy_saver_with_new_var_list(
+            old_saver=self._last_save_saver, new_var_list=named_variables)
+      else:
+        self._last_save_saver = saver_lib.Saver(var_list=named_variables)
+      self._last_save_object_graph = graph_proto
+    with ops.device("/cpu:0"):
+      save_path = self._last_save_saver.save(
+          sess=_SessionWithFeedDictAdditions(
+              session=session, feed_additions=feed_additions),
+          save_path=file_prefix,
+          write_meta_graph=False,
+          global_step=checkpoint_number)
+    return save_path
+
+  def _global_variable_names(self):
+    """Generate a `tf.train.Saver`-style `var_list` using `variable.name`s."""
+    named_saveables, graph_proto = _serialize_object_graph(
+        self._root_checkpointable)
+    saver_names = {}
+    for object_proto in graph_proto.nodes:
+      for attribute_proto in object_proto.attributes:
+        saver_names[attribute_proto.full_name] = named_saveables[
+            attribute_proto.checkpoint_key]
+    return saver_names
+
+  def restore(self, save_path):
+    """Restore a training checkpoint.
+
+    Restores `root_checkpointable` and any objects that it tracks
+    (transitive). Either assigns values immediately if variables to restore have
+    been created already, or defers restoration until the variables are
+    created. Dependencies added to the `root_checkpointable` passed to the
+    constructor after this call will be matched if they have a corresponding
+    object in the checkpoint.
+
+    When building a graph, restorations are added to the graph but not run.
+
+    To disallow deferred loading, assert immediately that all checkpointed
+    variables have been matched to variable objects:
+
+    ```python
+    saver = Saver(root)
+    saver.restore(path).assert_consumed()
+    ```
+
+    An exception will be raised unless every object was matched and its
+    variables already exist.
+
+    When graph building, `assert_consumed()` indicates that all of the restore
+    ops which will be created for this checkpoint have been created. They can be
+    run via the `run_restore_ops()` function of the status object:
+
+    ```python
+    saver.restore(path).assert_consumed().run_restore_ops()
+    ```
+
+    If the checkpoint has not been consumed completely, then the list of restore
+    ops will grow as more objects are added to the dependency graph.
+
+    Name-based `tf.train.Saver` checkpoints can be loaded using this
+    method. There is no deferred loading, and names are used to match
+    variables. No restore ops are created/run until `run_restore_ops()` or
+    `initialize_or_restore()` are called on the returned status object, even
+    when executing eagerly. Re-encode name-based checkpoints using this
+    object-based `Saver.save` as soon as possible.
+
+    Args:
+      save_path: The path to the checkpoint, as returned by `save` or
+        `tf.train.latest_checkpoint`. If None (as when there is no latest
+        checkpoint for `tf.train.latest_checkpoint` to return), returns an
+        object which may run initializers for objects in the dependency
+        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
+        names are used to match variables.
+
+    Returns:
+      A load status object, which can be used to make assertions about the
+      status of checkpoint restoration and run initialization/restore ops
+      (of type `CheckpointLoadStatus`, or `InitializationOnlyStatus` if
+      `save_path` is `None`).
+
+      If `save_path` points to a name-based checkpoint, a `NameBasedSaverStatus`
+      object is returned which runs restore ops from a name-based saver.
+    """
+    if save_path is None:
+      return InitializationOnlyStatus(self._root_checkpointable)
+    in_graph_mode = not context.executing_eagerly()
+    if in_graph_mode:
+      file_prefix_tensor = self._file_prefix_placeholder
+      file_prefix_feed_dict = {self._file_prefix_placeholder: save_path}
+    else:
+      with ops.device("/cpu:0"):
+        file_prefix_tensor = constant_op.constant(save_path)
+      file_prefix_feed_dict = None
+    reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+    try:
+      object_graph_string = reader.get_tensor(
+          checkpointable_lib.OBJECT_GRAPH_PROTO_KEY)
+    except errors_impl.NotFoundError:
+      # The object graph proto does not exist in this checkpoint. Try again with
+      # name-based saving.
+      return NameBasedSaverStatus(self, save_path)
+
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+    object_graph_proto.ParseFromString(object_graph_string)
+    if in_graph_mode and object_graph_proto == self._last_restore_object_graph:
+      checkpoint = self._last_restore_checkpoint
+    else:
+      if in_graph_mode:
+        dtype_map = None
+      else:
+        dtype_map = reader.get_variable_to_dtype_map()
+      checkpoint = _CheckpointRestoreCoordinator(
+          object_graph_proto=object_graph_proto,
+          save_path=file_prefix_tensor,
+          dtype_map=dtype_map)
+      if in_graph_mode:
+        if self._last_restore_object_graph is not None:
+          raise NotImplementedError(
+              "Using a single Saver to restore different object graphs is not "
+              "currently supported when graph building. Use a different Saver "
+              "for each object graph (restore ops will be duplicated), or "
+              "file a feature request if this limitation bothers you.")
+        self._last_restore_checkpoint = checkpoint
+        self._last_restore_object_graph = object_graph_proto
+    checkpointable_lib._CheckpointPosition(  # pylint: disable=protected-access
+        checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
+    load_status = CheckpointLoadStatus(
+        checkpoint, feed_dict=file_prefix_feed_dict)
+    return load_status
+
+
+class Checkpoint(checkpointable_lib.Checkpointable):
+  """A utility class which groups `Checkpointable` objects.
+
+  Accepts arbitrary keyword arguments to its constructor and saves those values
+  with a checkpoint. Maintains a `save_counter` for numbering checkpoints.
+
+  Example usage:
+
+  ```python
+  import tensorflow as tf
+  import tensorflow.contrib.eager as tfe
+  import os
+
+  checkpoint_directory = "/tmp/training_checkpoints"
+  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+  root = tfe.Checkpoint(optimizer=optimizer, model=model)
+  root.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  for _ in range(num_training_steps):
+    optimizer.minimize( ... )
+  root.save(file_prefix=checkpoint_prefix)
+  ```
+
+  For more manual control over saving, use `tfe.CheckpointableSaver` directly.
+
+  Attributes:
+    save_counter: Incremented when `save()` is called. Used to number
+      checkpoints.
+  """
+
+  def __init__(self, **kwargs):
+    """Group objects into a training checkpoint.
+
+    Args:
+      **kwargs: Keyword arguments are set as attributes of this object, and are
+        saved with the checkpoint. Attribute values must derive from
+        `CheckpointableBase`.
+    Raises:
+      ValueError: If objects in `kwargs` are not Checkpointable.
+    """
+    super(Checkpoint, self).__init__()
+    for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
+      if not isinstance(v, checkpointable_lib.CheckpointableBase):
+        raise ValueError(
+            ("`Checkpoint` was expecting an object derived from "
+             "`CheckpointableBase`, got %s.") % (v,))
+      setattr(self, k, v)
+    self._save_counter = None  # Created lazily for restore-on-create.
+    self._saver = CheckpointableSaver(weakref.ref(self))
+
+  def _maybe_create_save_counter(self):
+    """Create a save counter if it does not yet exist."""
+    if self._save_counter is None:
+      # Initialized to 0 and incremented before saving.
+      with ops.device("/cpu:0"):
+        self._save_counter = add_variable(
+            self, name="save_counter", initializer=0, dtype=dtypes.int64)
+
+  @property
+  def save_counter(self):
+    """An integer variable which starts at zero and is incremented on save.
+
+    Used to number checkpoints.
+
+    Returns:
+      The save counter variable.
+    """
+    self._maybe_create_save_counter()
+    return self._save_counter
+
+  def save(self, file_prefix, session=None):
+    """Save a checkpoint. Wraps `tfe.CheckpointableSaver.save`."""
+    in_graph_mode = not context.executing_eagerly()
+    if in_graph_mode:
+      if session is None:
+        session = ops.get_default_session()
+      if self._save_counter is None:
+        # When graph building, if this is a new save counter variable then it
+        # needs to be initialized before assign_add. This is only an issue if
+        # restore() has not been called first.
+        session.run(self.save_counter.initializer)
+    with ops.colocate_with(self.save_counter):
+      assign_op = self.save_counter.assign_add(1)
+    if in_graph_mode:
+      session.run(assign_op)
+    return self._saver.save(
+        file_prefix=file_prefix,
+        checkpoint_number=self.save_counter,
+        session=session)
+
+  def restore(self, save_path):
+    """Restore a checkpoint. Wraps `tfe.CheckpointableSaver.restore`."""
+    status = self._saver.restore(save_path=save_path)
+    # Create the save counter now so it gets initialized with other variables
+    # when graph building. Creating it earlier would lead to double
+    # initialization when executing eagerly.
+    self._maybe_create_save_counter()
+    return status
diff --git a/tensorflow/python/training/checkpointable_utils_test.py b/tensorflow/python/training/checkpointable_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..29fcdb70b418b08f543d75ff643b82d2a456754d
--- /dev/null
+++ b/tensorflow/python/training/checkpointable_utils_test.py
@@ -0,0 +1,1305 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import six
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import sequential
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import template
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import adam
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import training_util
+
+
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class InterfaceTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testAddVariable(self):
+    obj = NonLayerCheckpointable()
+    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
+      checkpointable_utils.add_variable(
+          obj, name="shape_specified_twice", shape=[], initializer=1)
+    constant_initializer = checkpointable_utils.add_variable(
+        obj, name="constant_initializer", initializer=1)
+    with variable_scope.variable_scope("some_variable_scope"):
+      ones_initializer = checkpointable_utils.add_variable(
+          obj,
+          name="ones_initializer",
+          shape=[2],
+          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
+    bare_initializer = checkpointable_utils.add_variable(
+        obj,
+        name="bare_initializer",
+        shape=[2, 2],
+        dtype=dtypes.float64,
+        initializer=init_ops.zeros_initializer)
+
+    # Even in graph mode, there are no naming conflicts between objects, only
+    # naming conflicts within an object.
+    other_duplicate = resource_variable_ops.ResourceVariable(
+        name="duplicate", initial_value=1.)
+    duplicate = checkpointable_utils.add_variable(
+        obj, name="duplicate", shape=[])
+    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
+      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
+
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    self.assertEqual("constant_initializer:0", constant_initializer.name)
+    self.assertEqual(1, self.evaluate(constant_initializer))
+    self.assertEqual("some_variable_scope/ones_initializer:0",
+                     ones_initializer.name)
+    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
+    self.assertAllEqual([[0., 0.],
+                         [0., 0.]], self.evaluate(bare_initializer))
+    self.assertEqual("a_variable:0", obj.a_variable.name)
+    self.assertEqual("duplicate:0", other_duplicate.name)
+    if context.executing_eagerly():
+      # When executing eagerly, there's no uniquification of variable names. The
+      # checkpoint name will be the same.
+      self.assertEqual("duplicate:0", duplicate.name)
+    else:
+      # The .name attribute may be globally influenced, but the checkpoint name
+      # won't be (tested below).
+      self.assertEqual("duplicate_1:0", duplicate.name)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(obj)
+    expected_checkpoint_names = (
+        "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
+        "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+        "constant_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+        "duplicate/.ATTRIBUTES/VARIABLE_VALUE",
+        "ones_initializer/.ATTRIBUTES/VARIABLE_VALUE",
+    )
+    six.assertCountEqual(
+        self, expected_checkpoint_names, named_variables.keys())
+
+  def testInitNotCalled(self):
+
+    class NoInit(checkpointable.Checkpointable):
+
+      def __init__(self):
+        pass
+
+    # __init__ for Checkpointable will be called implicitly.
+    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
+
+  def testShapeDtype(self):
+    root = checkpointable.Checkpointable()
+    v1 = checkpointable_utils.add_variable(
+        root, name="v1", initializer=3., dtype=dtypes.float64)
+    self.assertEqual(dtypes.float64, v1.dtype)
+    v2 = checkpointable_utils.add_variable(
+        root,
+        name="v2",
+        shape=[3],
+        initializer=init_ops.ones_initializer,
+        dtype=dtypes.float64)
+    self.assertEqual(dtypes.float64, v2.dtype)
+    self.assertAllEqual([1., 1., 1.], self.evaluate(v2))
+
+
+class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    tensor = self._primary_variable.read_value()
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(
+        tensor=tensor,
+        slice_spec="",
+        name=name)
+    super(_MirroringSaveable, self).__init__(
+        tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
+  """A Checkpointable object which returns a more complex SaveableObject."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+    self.mirrored = variable_scope.get_variable(
+        name="mirrored", initializer=15., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph = (
+        checkpointable_utils._serialize_object_graph(root_checkpointable))
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step:0",
+        named_variables["optimizer_step" + suffix].name)
+    self.assertEqual(
+        "my_model/dense_1/kernel:0",
+        named_variables["model/_second/kernel" + suffix].name)
+    self.assertEqual(
+        "my_model/dense/kernel:0",
+        named_variables["model/_named_dense/kernel" + suffix].name)
+    self.assertEqual(
+        "beta1_power:0",
+        named_variables["optimizer/beta1_power" + suffix].name)
+    self.assertEqual(
+        "beta2_power:0",
+        named_variables["optimizer/beta2_power" + suffix].name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta1_power",
+                     optimizer_node.children[0].local_name)
+    self.assertEqual("beta1_power",
+                     serialized_graph.nodes[optimizer_node.children[0].node_id]
+                     .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=named_variables["model/_named_dense/kernel" + suffix],
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMoreComplexSaveableReturned(self):
+    v = _OwnsMirroredVariables()
+    checkpoint = checkpointable_utils.Checkpoint(v=v)
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    save_path = checkpoint.save(prefix)
+    self.evaluate(v.non_dep_variable.assign(43.))
+    self.evaluate(v.mirrored.assign(44.))
+    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
+    self.assertEqual(42., self.evaluate(v.non_dep_variable))
+    self.assertEqual(42., self.evaluate(v.mirrored))
+    self.evaluate(v.non_dep_variable.assign(44.))
+    save_path = checkpoint.save(prefix)
+    self.evaluate(v.non_dep_variable.assign(45.))
+    checkpoint.restore(save_path).assert_consumed().initialize_or_restore()
+    self.assertEqual(44., self.evaluate(v.non_dep_variable))
+    self.assertEqual(44., self.evaluate(v.mirrored))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMoreComplexSaveableReturnedWithGlobalName(self):
+    # The same object can also be saved using the name-based saver.
+    v = _OwnsMirroredVariables()
+    saver = saver_lib.Saver(var_list=[v])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    self.evaluate(v.non_dep_variable.assign(42.))
+    with self.test_session() as sess:
+      save_path = saver.save(sess, prefix)
+      self.evaluate(v.non_dep_variable.assign(43.))
+      self.evaluate(v.mirrored.assign(44.))
+      saver.restore(sess, save_path)
+      self.assertEqual(42., self.evaluate(v.non_dep_variable))
+      self.assertEqual(42., self.evaluate(v.mirrored))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_checkpointable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
+    on_create_root = checkpointable_utils.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_consumed()
+    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(saver_lib.latest_checkpoint(checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = checkpointable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+          with self.test_session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+            else:
+              status.assert_consumed()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = saver_lib.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
+  def _get_checkpoint_name(self, name):
+    root = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        root, name=name, shape=[1, 2], dtype=dtypes.float64)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    checkpoint_name, = named_variables.keys()
+    with ops.name_scope("root/" + checkpoint_name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return checkpoint_name
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testVariableNameEscaping(self):
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    self.assertEqual(r"a.Sb.Sc" + suffix, self._get_checkpoint_name(r"a/b/c"))
+    self.assertEqual(r"b" + suffix, self._get_checkpoint_name(r"b"))
+    self.assertEqual(r"c.S" + suffix, self._get_checkpoint_name(r"c/"))
+    self.assertEqual(r"d.S..S" + suffix, self._get_checkpoint_name(r"d/.S"))
+    self.assertEqual(r"d.S..ATTRIBUTES.Sf" + suffix,
+                     self._get_checkpoint_name(r"d/.ATTRIBUTES/f"))
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNumberedPath(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    root.leaf = leaf
+    checkpointable_utils.add_variable(leaf, name="v", shape=[])
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    variable_name, = named_variables.keys()
+    self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", variable_name)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLocalNameValidation(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    # Dots are escaped, which avoids conflicts with reserved names.
+    root._track_checkpointable(leaf, name=".ATTRIBUTES")
+    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    name, = named_variables.keys()
+    self.assertEqual(name, "..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE")
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = checkpointable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLateDependencyTracking(self):
+
+    class Dependency(checkpointable.Checkpointable):
+
+      def build(self):
+        self.var = checkpointable_utils.add_variable(
+            self, "var", initializer=0.)
+
+    class LateDependencies(checkpointable.Checkpointable):
+
+      def add_dep(self):
+        self.dep = Dependency()
+        self.dep.build()
+
+    original = LateDependencies()
+    original.add_dep()
+    self.evaluate(state_ops.assign(original.dep.var, 123.))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpointable_utils.CheckpointableSaver(
+        original).save(checkpoint_prefix)
+    load_into = LateDependencies()
+    status = checkpointable_utils.CheckpointableSaver(
+        load_into).restore(save_path)
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    load_into.add_dep()
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(123., self.evaluate(load_into.dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDepAfterVar(self):
+
+    class Dependency(checkpointable.Checkpointable):
+
+      def build(self):
+        self.var = checkpointable_utils.add_variable(
+            self, "var", initializer=0.)
+
+    class DepAfterVar(checkpointable.Checkpointable):
+
+      def add_dep(self):
+        dep = Dependency()
+        dep.build()
+        self.dep = dep
+
+    dep_after_var = DepAfterVar()
+    dep_after_var.add_dep()
+    self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpointable_utils.CheckpointableSaver(dep_after_var).save(
+        checkpoint_prefix)
+
+    loaded_dep_after_var = DepAfterVar()
+    status = checkpointable_utils.CheckpointableSaver(
+        loaded_dep_after_var).restore(save_path)
+    loaded_dep_after_var.add_dep()
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = checkpointable.Checkpointable()
+    root.var = checkpointable_utils.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.AdamOptimizer(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(checkpointable_utils.gather_initializers(
+          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "with_slots"))
+    new_root = checkpointable.Checkpointable()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(slots_path)
+    no_slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = checkpointable_utils.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.AdamOptimizer(0.1)
+    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOverlappingRestores(self):
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep = checkpointable.Checkpointable()
+    save_root.dep.var = checkpointable_utils.add_variable(
+        save_root.dep, name="var", initializer=0.)
+    self.evaluate(state_ops.assign(save_root.dep.var, 12.))
+    saver = checkpointable_utils.CheckpointableSaver(save_root)
+    first_path = saver.save(os.path.join(checkpoint_directory, "first"))
+    self.evaluate(state_ops.assign(save_root.dep.var, 13.))
+    second_path = saver.save(os.path.join(checkpoint_directory, "second"))
+
+    first_root = checkpointable.Checkpointable()
+    second_root = checkpointable.Checkpointable()
+    first_status = checkpointable_utils.CheckpointableSaver(
+        first_root).restore(first_path)
+    second_status = checkpointable_utils.CheckpointableSaver(
+        second_root).restore(second_path)
+    load_dep = checkpointable.Checkpointable()
+    load_dep.var = checkpointable_utils.add_variable(
+        load_dep, name="var", shape=[])
+    first_root.dep = load_dep
+    first_status.assert_consumed()
+    first_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+    second_root.dep = load_dep
+    second_status.assert_consumed()
+    second_status.run_restore_ops()
+    self.assertEqual(13., self.evaluate(load_dep.var))
+
+    # Try again with the order of the restore() reversed. The last restore
+    # determines the final value.
+    first_root = checkpointable.Checkpointable()
+    second_root = checkpointable.Checkpointable()
+    second_status = checkpointable_utils.CheckpointableSaver(
+        second_root).restore(second_path)
+    first_status = checkpointable_utils.CheckpointableSaver(
+        first_root).restore(first_path)
+    load_dep = checkpointable.Checkpointable()
+    load_dep.var = checkpointable_utils.add_variable(
+        load_dep, name="var", shape=[])
+    first_root.dep = load_dep
+    first_status.assert_consumed()
+    first_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+    second_root.dep = load_dep
+    second_status.assert_consumed()
+    second_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(load_dep.var))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAmbiguousLoad(self):
+    # Not OK to split one checkpoint object into two
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep_one = checkpointable.Checkpointable()
+    save_root.dep_two = checkpointable.Checkpointable()
+    dep_three = checkpointable.Checkpointable()
+    save_root.dep_one.dep_three = dep_three
+    save_root.dep_two.dep_three = dep_three
+    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(save_root))
+    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+    load_root = checkpointable.Checkpointable()
+    checkpointable_utils.CheckpointableSaver(load_root).restore(save_path)
+    load_root.dep_one = checkpointable.Checkpointable()
+    load_root.dep_two = checkpointable.Checkpointable()
+    load_root.dep_one.dep_three = checkpointable.Checkpointable()
+    with self.assertRaisesRegexp(AssertionError,
+                                 "resolved to different objects"):
+      load_root.dep_two.dep_three = checkpointable.Checkpointable()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testObjectsCombined(self):
+    # Currently fine to load two checkpoint objects into one Python object
+    checkpoint_directory = self.get_temp_dir()
+    save_root = checkpointable.Checkpointable()
+    save_root.dep_one = checkpointable.Checkpointable()
+    save_root.dep_two = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
+    checkpointable_utils.add_variable(
+        save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
+    self.evaluate(checkpointable_utils.gather_initializers(save_root))
+    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+    load_root = checkpointable.Checkpointable()
+    load_root.dep_one = checkpointable.Checkpointable()
+    load_root.dep_two = load_root.dep_one
+    v1 = checkpointable_utils.add_variable(
+        load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
+    v2 = checkpointable_utils.add_variable(
+        load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
+    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
+        save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(32., self.evaluate(v1))
+    self.assertEqual(64., self.evaluate(v2))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDependencyLoop(self):
+    # Note: this test creates garbage during eager execution because it
+    # purposefully creates a reference cycle.
+    first = checkpointable.Checkpointable()
+    second = checkpointable.Checkpointable()
+    first.second = second
+    second.first = first
+    first.v = checkpointable_utils.add_variable(
+        first, "v1", initializer=[3., 1., 4.])
+    second.v = checkpointable_utils.add_variable(
+        second, "v2", initializer=[1., 1., 2., 3.])
+    self.evaluate(checkpointable_utils.gather_initializers(first))
+    checkpoint_directory = self.get_temp_dir()
+    save_path = checkpointable_utils.CheckpointableSaver(first).save(
+        os.path.join(checkpoint_directory, "ckpt"))
+
+    # Test deferred loading
+    first_load = checkpointable.Checkpointable()
+    status = checkpointable_utils.CheckpointableSaver(
+        first_load).restore(save_path)
+    second_load = checkpointable.Checkpointable()
+    first_load.second = second_load
+    second_load.first = first_load
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    first_load.v = checkpointable_utils.add_variable(
+        first_load, "v1", shape=[3])
+    second_load.v = checkpointable_utils.add_variable(
+        second_load, "v2", shape=[4])
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
+    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
+
+    # Test loading when variables have already been created
+    self.evaluate(first_load.v.assign([2., 7., 1.]))
+    self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
+    self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
+    self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
+    status = checkpointable_utils.CheckpointableSaver(first_load).restore(
+        save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
+    self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testRestoreOnAssign(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_graph = ops.Graph()
+    with save_graph.as_default(), self.test_session(save_graph):
+      first = checkpointable.Checkpointable()
+      first.var1 = variable_scope.get_variable(
+          name="outside_var", initializer=0.)
+      first.var2 = variable_scope.get_variable(
+          name="blah", initializer=0.)
+      self.evaluate(first.var1.assign(4.))
+      self.evaluate(first.var2.assign(8.))
+      save_path = checkpointable_utils.CheckpointableSaver(first).save(
+          checkpoint_prefix)
+    restore_graph = ops.Graph()
+    with restore_graph.as_default(), self.test_session(restore_graph):
+      second = checkpointable.Checkpointable()
+      second.var2 = variable_scope.get_variable(
+          name="blah", initializer=0.)
+      status = checkpointable_utils.CheckpointableSaver(
+          second).restore(save_path)
+      recreated_var1 = variable_scope.get_variable(
+          name="outside_var", initializer=0.)
+      status.run_restore_ops()
+      self.assertEqual(8., self.evaluate(second.var2))
+      self.evaluate(recreated_var1.assign(-2.))
+      self.assertEqual(-2., self.evaluate(recreated_var1))
+      second.var1 = recreated_var1
+      status.run_restore_ops()
+      self.assertEqual(4., self.evaluate(recreated_var1))
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        saver.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        saver.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCheckpointCleanup(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    obj = checkpointable.Checkpointable()
+    obj.var = variable_scope.get_variable(name="v", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    saver = checkpointable_utils.Checkpoint(obj=obj)
+    for _ in range(10):
+      saver.save(checkpoint_prefix)
+    expected_filenames = ["checkpoint"]
+    for checkpoint_number in range(6, 11):
+      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
+      expected_filenames.append(
+          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
+    six.assertCountEqual(
+        self,
+        expected_filenames,
+        os.listdir(checkpoint_directory))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCheckpointCleanupChangingVarList(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    obj = checkpointable.Checkpointable()
+    obj.var = variable_scope.get_variable(name="v", initializer=0.)
+    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
+    looped_variables = []
+    for iteration in range(10):
+      new_variable = resource_variable_ops.ResourceVariable(iteration)
+      self.evaluate(new_variable.initializer)
+      setattr(checkpoint, "var_%d" % iteration, new_variable)
+      checkpoint.save(checkpoint_prefix)
+      looped_variables.append(new_variable)
+    expected_filenames = ["checkpoint"]
+    # We've copied the saver each time, but checkpoint management should still
+    # be consistent.
+    for checkpoint_number in range(6, 11):
+      expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
+      expected_filenames.append(
+          "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
+    six.assertCountEqual(
+        self,
+        expected_filenames,
+        os.listdir(checkpoint_directory))
+    for v in looped_variables:
+      self.evaluate(v.assign(314))
+    checkpoint.restore(checkpoint_prefix + "-6").run_restore_ops()
+    self.assertEqual(314, self.evaluate(checkpoint.var_9))
+    self.assertEqual(314, self.evaluate(checkpoint.var_8))
+    self.assertEqual(314, self.evaluate(checkpoint.var_6))
+    self.assertEqual(5, self.evaluate(checkpoint.var_5))
+    self.assertEqual(1, self.evaluate(checkpoint.var_1))
+    self.assertEqual(0, self.evaluate(checkpoint.var_0))
+    if context.executing_eagerly():
+      checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+      self.assertEqual(9, self.evaluate(checkpoint.var_9))
+      self.assertEqual(8, self.evaluate(checkpoint.var_8))
+      self.assertEqual(1, self.evaluate(checkpoint.var_1))
+      self.assertEqual(0, self.evaluate(checkpoint.var_0))
+    else:
+      # Restoring into modified graphs is an error while graph building.
+      with self.assertRaises(NotImplementedError):
+        checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        save_path = saver.save(checkpoint_prefix)
+        saver.restore(save_path)
+        before_ops = graph.get_operations()
+        saver.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.AdamOptimizer(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(checkpointable_utils.gather_initializers(
+            first_root_checkpointable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_checkpointable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(6.))
+        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+        status = second_root_checkpointable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_sequential(self):
+    model = sequential.Sequential()
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    model.add(core.Dense(4))
+    second_dense = core.Dense(5)
+    model.add(second_dense)
+    model(constant_op.constant([[1.]]))
+    checkpoint.restore(None).initialize_or_restore()
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([1., 2., 3., 4., 5.])))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([5., 6., 7., 8., 9.])))
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
+
+    deferred_sequential = sequential.Sequential()
+    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
+        model=deferred_sequential)
+    status = deferred_sequential_checkpoint.restore(save_path)
+    deferred_sequential.add(core.Dense(4))
+    deferred_sequential(constant_op.constant([[1.]]))
+    deferred_second_dense = core.Dense(5)
+    deferred_sequential.add(deferred_second_dense)
+    deferred_sequential(constant_op.constant([[1.]]))
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.],
+                        self.evaluate(deferred_second_dense.bias))
+
+
+class TemplateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_checkpointable_save_restore(self):
+
+    def _templated():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer())
+      v2 = variable_scope.get_variable(
+          "v2", shape=[1], initializer=init_ops.zeros_initializer())
+      return v, v + 1., v2
+
+    save_template = template.make_template("s1", _templated)
+    save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+    v1_save, _, v2_save = save_template()
+    self.evaluate(v1_save.assign([12.]))
+    self.evaluate(v2_save.assign([14.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _templated)
+    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    status = load_root.restore(save_path)
+    var, var_plus_one, var2 = load_template()
+    self.assertEqual(2, len(load_template._checkpoint_dependencies))
+    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
+    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([12.], self.evaluate(var))
+    self.assertAllEqual([13.], self.evaluate(var_plus_one))
+    self.assertAllEqual([14.], self.evaluate(var2))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_checkpointable_save_restore_nested(self):
+
+    def _inner_template():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer())
+      return v
+
+    def _outer_template():
+      first_inner = template.make_template("i1", _inner_template)
+      second_inner = template.make_template("i2", _inner_template)
+      v1 = first_inner()
+      v2 = second_inner()
+      v3 = second_inner()
+      return (first_inner, second_inner), (v1, v2, v3)
+
+    with variable_scope.variable_scope("ignored"):
+      save_template = template.make_template("s1", _outer_template)
+      save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+      (inner_template_one, inner_template_two), _ = save_template()
+    self.evaluate(inner_template_one.variables[0].assign([20.]))
+    self.evaluate(inner_template_two.variables[0].assign([25.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _outer_template)
+    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    status = load_root.restore(save_path)
+    (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
+    outer_template_dependencies = load_root.my_template._checkpoint_dependencies
+    self.assertEqual(2, len(outer_template_dependencies))
+    self.assertEqual("i1", outer_template_dependencies[0].name)
+    self.assertIs(inner_template_one, outer_template_dependencies[0].ref)
+    self.assertEqual("i2", outer_template_dependencies[1].name)
+    self.assertIs(inner_template_two, outer_template_dependencies[1].ref)
+    self.assertEqual(1, len(inner_template_one._checkpoint_dependencies))
+    self.assertEqual("v", inner_template_one._checkpoint_dependencies[0].name)
+    self.assertEqual(1, len(inner_template_two._checkpoint_dependencies))
+    self.assertEqual("v", inner_template_two._checkpoint_dependencies[0].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([20.], self.evaluate(v1))
+    self.assertAllEqual([25.], self.evaluate(v2))
+    self.assertAllEqual([25.], self.evaluate(v3))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = saver_lib.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      status = object_saver.restore(save_path)
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        object_saver = checkpointable_utils.CheckpointableSaver(root)
+        save_path = object_saver.save(
+            session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 297a8bbde5447cff9465be36c0bb71f2490c60fc..7bd57ad3d854534e196fa7b72bebbd7195e6bca8 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -237,7 +237,17 @@ class MomentumOptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
 
       # pylint: disable=cell-var-from-loop
       def loss():
@@ -256,7 +266,17 @@ class MomentumOptimizerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
-    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
 
     def loss():
       return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 4ce6f6d00267410626f7d7a9e2251d3f40b6bb6e..f584a009d946a193f1ab76b3030db4f8a4954d27 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -202,7 +202,7 @@ class Scaffold(object):
     if self._local_init_op is None:
       self._local_init_op = Scaffold.get_or_default(
           'local_init_op', ops.GraphKeys.LOCAL_INIT_OP,
-          Scaffold._default_local_init_op)
+          Scaffold.default_local_init_op)
     if self._summary_op is None:
       self._summary_op = Scaffold.get_or_default('summary_op',
                                                  ops.GraphKeys.SUMMARY_OP,
@@ -267,7 +267,17 @@ class Scaffold(object):
     return op
 
   @staticmethod
-  def _default_local_init_op():
+  def default_local_init_op():
+    """Returns an op that groups the default local init ops.
+
+    This op is used during session initialization when a Scaffold is
+    initialized without specifying the local_init_op arg. It includes
+    `tf.local_variables_initializer`, `tf.tables_initializer`, and also
+    initializes local session resources.
+
+    Returns:
+      The default Scaffold local init op.
+    """
     return control_flow_ops.group(
         variables.local_variables_initializer(),
         lookup_ops.tables_initializer(),
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 75665fc2840797dd53dd863721d2744cb1b08af5..f126d3847b6b0b43495c63b31ca915c107ede969 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -689,9 +689,7 @@ class Optimizer(
       # device_policy is set because non-mirrored tensors will be read in
       # `update_op`. `_resource_apply_dense`, `lr_t`, `beta1_t` and `beta2_t`
       # is an example.
-      with ops.name_scope(
-          "update_" + scope_name), context.context().device_policy(
-              context.DEVICE_PLACEMENT_SILENT):
+      with ops.name_scope("update_" + scope_name):
         return p.update_op(self, g)
 
     with ops.name_scope(name, self._name) as name:
@@ -707,11 +705,8 @@ class Optimizer(
         return self._finish(update_ops, "update")
 
       non_slot_devices = distribution.non_slot_devices(var_list)
-      # Device policy is needed because hyperparameter tensors (such as
-      # AdamOptimizer's beta1_t) need to be copied across devices in Eager.
-      with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-        finish_updates = distribution.update_non_slot(
-            non_slot_devices, finish, self, update_ops)
+      finish_updates = distribution.update_non_slot(
+          non_slot_devices, finish, self, update_ops)
       if global_step is None:
         apply_updates = distribution.group(finish_updates, name=name)
       else:
@@ -823,13 +818,13 @@ class Optimizer(
           if restored_initial_value is not None:
             initial_value = restored_initial_value
         v = variable_scope.variable(initial_value, name=name, trainable=False)
-        # Restore this variable by name if necessary, but don't add a
-        # Checkpointable dependency. Optimizers return the current graph's
-        # non-slot variables from _checkpoint_dependencies explicitly rather
-        # than unconditionally adding dependencies (since there may be multiple
-        # non-slot variables with the same name in different graphs, trying to
-        # save all of them would result in errors).
-        self._handle_deferred_dependencies(name=name, checkpointable=v)
+      # Restore this variable by name if necessary, but don't add a
+      # Checkpointable dependency. Optimizers return the current graph's
+      # non-slot variables from _checkpoint_dependencies explicitly rather
+      # than unconditionally adding dependencies (since there may be multiple
+      # non-slot variables with the same name in different graphs, trying to
+      # save all of them would result in errors).
+      self._handle_deferred_dependencies(name=name, checkpointable=v)
       self._non_slot_dict[key] = v
 
     return v
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index e40b8d22ed2ab0f4c9ff65e953f0f1cf681c8068..79d278cf904fca4781d9bf7f4fc2cf67e3810a72 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 import collections
 import os.path
 import re
+import sys
 import time
 import uuid
 
@@ -30,8 +31,10 @@ import six
 
 from google.protobuf import text_format
 
+from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -1340,6 +1343,9 @@ class Saver(object):
       self._check_saver_def()
       self._write_version = self.saver_def.version
     self._save_relative_paths = save_relative_paths
+    # For compatibility with object-based checkpoints, we may build a second
+    # Saver to read the renamed keys.
+    self._object_restore_saver = None
 
   def build(self):
     if context.executing_eagerly():
@@ -1795,11 +1801,65 @@ class Saver(object):
     if save_path is None:
       raise ValueError("Can't load save_path when it is None.")
     logging.info("Restoring parameters from %s", save_path)
-    if context.executing_eagerly():
-      self._build_eager(save_path, build_save=False, build_restore=True)
-    else:
-      sess.run(self.saver_def.restore_op_name,
-               {self.saver_def.filename_tensor_name: save_path})
+    try:
+      if context.executing_eagerly():
+        self._build_eager(save_path, build_save=False, build_restore=True)
+      else:
+        sess.run(self.saver_def.restore_op_name,
+                 {self.saver_def.filename_tensor_name: save_path})
+    except errors.NotFoundError:
+      exception_type, exception_value, exception_traceback = sys.exc_info()
+      # The checkpoint would not be loaded successfully as is. Try to parse it
+      # as an object-based checkpoint.
+      try:
+        reader = pywrap_tensorflow.NewCheckpointReader(save_path)
+        object_graph_string = reader.get_tensor(
+            checkpointable.OBJECT_GRAPH_PROTO_KEY)
+      except errors.NotFoundError:
+        # This is not an object-based checkpoint, or the checkpoint doesn't
+        # exist. Re-raise the original exception.
+        six.reraise(exception_type, exception_value, exception_traceback)
+      del exception_traceback  # avoid reference cycles
+
+      # This is an object-based checkpoint. We'll print a warning and then do
+      # the restore.
+      logging.warning(
+          # TODO(allenl): Modify instructions for using the object-based saver
+          # once that's in core.
+          "Restoring an object-based checkpoint using a name-based saver. This "
+          "may be somewhat fragile, and will re-build the Saver. Instead, "
+          "consider loading object-based checkpoints using "
+          "tf.contrib.eager.Checkpoint().")
+      self._restore_from_object_based_checkpoint(
+          sess=sess, save_path=save_path,
+          object_graph_string=object_graph_string)
+
+  def _restore_from_object_based_checkpoint(self, sess, save_path,
+                                            object_graph_string):
+    """A compatibility mode for reading object-based checkpoints."""
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+    object_graph_proto.ParseFromString(object_graph_string)
+    names_to_keys = {}
+    for node in object_graph_proto.nodes:
+      for attribute in node.attributes:
+        names_to_keys[attribute.full_name] = attribute.checkpoint_key
+    saveables = self._builder._ValidateAndSliceInputs(self._var_list)  # pylint: disable=protected-access
+    for saveable in saveables:
+      for spec in saveable.specs:
+        if spec.name not in names_to_keys:
+          raise errors.NotFoundError(
+              None, None,
+              message=("Attempting to load an object-based checkpoint using "
+                       "variable names, but could not find %s in the "
+                       "checkpoint.") % spec.name)
+        spec.name = names_to_keys[spec.name]
+    if self._object_restore_saver is None:
+      # Cache the Saver so multiple restore() calls don't pollute the graph when
+      # graph building. This assumes keys are consistent (i.e. this is the same
+      # type of object-based checkpoint we saw previously).
+      self._object_restore_saver = Saver(saveables)
+    self._object_restore_saver.restore(sess=sess, save_path=save_path)
 
   @staticmethod
   def _add_collection_def(meta_graph_def, key, export_scope=None):
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 14dda7997948ead7b12dee953a0b2ee3b2ee8fc9..3867c0d8daae816839e25a28145c53ea9c869654 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import functools
 import math
 import os
 import random
@@ -50,6 +51,8 @@ from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -69,10 +72,12 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary import summary
 from tensorflow.python.training import adam
 from tensorflow.python.training import checkpointable
+from tensorflow.python.training import checkpointable_utils
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training import saver_test_utils
+from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
 
@@ -2948,6 +2953,29 @@ class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
     return self.non_dep_variable.name
 
 
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
 @test_util.with_c_api
 class CheckpointableCompatibilityTests(test.TestCase):
 
@@ -3011,6 +3039,128 @@ class CheckpointableCompatibilityTests(test.TestCase):
         saver.restore(sess, save_path)
         self.assertEqual(1, v.eval_count)
 
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def testVariableNotFoundErrorRaised(self):
+    # Restore does some tricky exception handling to figure out if it should
+    # load an object-based checkpoint. Tests that the exception handling isn't
+    # too broad.
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    a = resource_variable_ops.ResourceVariable(1., name="a")
+    b = resource_variable_ops.ResourceVariable(1., name="b")
+    a_saver = saver_module.Saver([a])
+    b_saver = saver_module.Saver([b])
+    with self.test_session() as sess:
+      sess.run(a.initializer)
+      save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
+      with self.assertRaisesRegexp(
+          errors.NotFoundError, "Key b not found in checkpoint"):
+        b_saver.restore(sess=sess, save_path=save_path)
+
+  def testCheckpointNotFoundErrorRaised(self):
+    # Restore does some tricky exception handling to figure out if it should
+    # load an object-based checkpoint. Tests that the exception handling isn't
+    # too broad.
+    a = resource_variable_ops.ResourceVariable(1., name="a")
+    saver = saver_module.Saver([a])
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(
+          errors.NotFoundError,
+          "Failed to find any matching files for path_which_does_not_exist"):
+        saver.restore(sess=sess, save_path="path_which_does_not_exist")
+
+  def testLoadFromObjectBasedGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    save_graph = ops_lib.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph) as sess:
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+
+      # An incompatible object-based checkpoint to check error messages
+      var = resource_variable_ops.ResourceVariable(1., name="a")
+      self.evaluate(var.initializer)
+      second_saver = checkpointable_utils.CheckpointableSaver(var)
+      second_path = second_saver.save(file_prefix=os.path.join(
+          checkpoint_directory, "second"))
+
+    restore_graph = ops_lib.Graph()
+    with restore_graph.as_default(), self.test_session(
+        graph=restore_graph) as sess:
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      saver = saver_module.Saver()
+      saver.restore(sess=sess, save_path=save_path)
+      self._check_sentinels(root)
+      before_second_restore_ops = restore_graph.get_operations()
+      # Test that multiple restores do not pollute the graph
+      saver.restore(sess=sess, save_path=save_path)
+      self.assertEqual(before_second_restore_ops,
+                       restore_graph.get_operations())
+      with self.assertRaisesRegexp(errors.NotFoundError,
+                                   "could not find a_variable"):
+        saver.restore(sess=sess, save_path=second_path)
+
+  def testLoadFromObjectBasedEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    save_graph = ops_lib.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph):
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      saver = saver_module.Saver(
+          root.model.variables + root.optimizer.variables())
+      saver.restore(sess=None, save_path=save_path)
+      self._check_sentinels(root)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1913fc20ee0212b3d9588828fe4da4ba7ebca030..80fc9ff2926c53b557a7ba9e242d597a89acf79f 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -33,6 +33,7 @@ cc_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
         "@local_config_cuda//cuda:cuda_headers",
     ],
@@ -45,6 +46,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/compiler/xla:statusor",
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_static([":stream_executor_impl"]),
 )
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 3fd927528921d78b916b2de7e7d4820443efa818..b3f956369d98c05c394890349df9e58e4222ae1b 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -169,11 +169,34 @@ static port::ThreadPool* GetCudaThreadpool() {
     }                                                              \
   } __name;
 
+#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM(__name)        \
+  struct WrapperShim__##__name {                                         \
+    template <typename... Args>                                          \
+    cudnnStatus_t operator()(CudnnSupport* dnn, Stream* s, Args... args) \
+        SHARED_LOCKS_REQUIRED(dnn->dnn_handle_mutex_) {                  \
+      CHECK_NOTNULL(s);                                                  \
+      CHECK_EQ(s, dnn->GetCurrentDnnStream())                            \
+          << "Stream is not set correctly!";                             \
+      cuda::ScopedActivateExecutorContext sac{dnn->GetParentExecutor()}; \
+      cudnnStatus_t retval = ::__name(args...);                          \
+      return retval;                                                     \
+    }                                                                    \
+  } __name;
+
+// Handles cudnnSetStream differently in order to add debug information.
+struct WrapperShim__cudnnSetStream {
+  cudnnStatus_t operator()(CudnnSupport* dnn, Stream* stream,
+                           cudnnHandle_t handle)
+      EXCLUSIVE_LOCKS_REQUIRED(dnn->dnn_handle_mutex_) {
+    dnn->SetCurrentDnnStream(stream);
+    cuda::ScopedActivateExecutorContext sac{dnn->GetParentExecutor()};
+    cudnnStatus_t retval = ::cudnnSetStream(handle, AsCUDAStreamValue(stream));
+    return retval;
+  }
+} cudnnSetStream;
+
 // clang-format off
 #define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnBatchNormalizationBackward)                \
-  __macro(cudnnBatchNormalizationForwardInference)        \
-  __macro(cudnnBatchNormalizationForwardTraining)         \
   __macro(cudnnGetConvolutionNdForwardOutputDim)          \
   __macro(cudnnGetConvolutionForwardAlgorithm)            \
   __macro(cudnnCreateTensorDescriptor)                    \
@@ -190,16 +213,25 @@ static port::ThreadPool* GetCudaThreadpool() {
   __macro(cudnnDestroyConvolutionDescriptor)              \
   __macro(cudnnCreate)                                    \
   __macro(cudnnDestroy)                                   \
-  __macro(cudnnSetStream)                                 \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
   __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnTransformTensor)                           \
   __macro(cudnnSetConvolutionNdDescriptor)                \
   __macro(cudnnSetTensor4dDescriptor)                     \
   __macro(cudnnSetTensorNdDescriptor)                     \
-  __macro(cudnnSetFilterNdDescriptor)                     \
+  __macro(cudnnSetFilterNdDescriptor)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+#undef CUDNN_DNN_ROUTINE_EACH
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(__macro)       \
+  __macro(cudnnBatchNormalizationBackward)                \
+  __macro(cudnnBatchNormalizationForwardInference)        \
+  __macro(cudnnBatchNormalizationForwardTraining)         \
+  __macro(cudnnActivationForward)                         \
+  __macro(cudnnConvolutionForward)                        \
+  __macro(cudnnConvolutionBackwardBias)                   \
+  __macro(cudnnTransformTensor)                           \
   __macro(cudnnPoolingForward)                            \
   __macro(cudnnPoolingBackward)                           \
   __macro(cudnnLRNCrossChannelForward)                    \
@@ -207,9 +239,11 @@ static port::ThreadPool* GetCudaThreadpool() {
   __macro(cudnnAddTensor)                                 \
   __macro(cudnnConvolutionBackwardData)                   \
   __macro(cudnnConvolutionBackwardFilter)
-// clang-format on
 
-CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_WITH_STREAM(
+    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_WITH_STREAM
 
 // APIs available after R3:
 #if CUDNN_VERSION >= 3000
@@ -225,14 +259,15 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 // APIs in R3 but not in R5
 // clang-format off
 #if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 5000
-#define CUDNN_DNN_ROUTINE_EACH_R3(__macro)                    \
+#define CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(__macro)        \
   __macro(cudnnAddTensor_v3)                                  \
   __macro(cudnnConvolutionBackwardData_v3)                    \
   __macro(cudnnConvolutionBackwardFilter_v3)
 // clang-format on
 
-CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R3
+CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM(
+    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R3_WITH_STREAM
 #endif
 
 // APIs in R5
@@ -254,29 +289,44 @@ CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
   __macro(cudnnGetRNNTrainingReserveSize)                     \
   __macro(cudnnGetRNNLinLayerMatrixParams)                    \
   __macro(cudnnGetRNNLinLayerBiasParams)                      \
-  __macro(cudnnRNNForwardInference)                           \
-  __macro(cudnnRNNForwardTraining)                            \
-  __macro(cudnnRNNBackwardData)                               \
-  __macro(cudnnRNNBackwardWeights)                            \
   __macro(cudnnSetRNNDescriptor)                              \
   __macro(cudnnGetFilterNdDescriptor)
 
 // clang-format on
-
 CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R5
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(__macro)        \
+  __macro(cudnnRNNForwardInference)                           \
+  __macro(cudnnRNNForwardTraining)                            \
+  __macro(cudnnRNNBackwardData)                               \
+  __macro(cudnnRNNBackwardWeights)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM(
+    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R5_WITH_STREAM
 #endif
 
 // APIs in R6
 // clang-format off
 #if CUDNN_VERSION >= 6000
 #define CUDNN_DNN_ROUTINE_EACH_R6(__macro)                    \
-  __macro(cudnnConvolutionBiasActivationForward)              \
   __macro(cudnnSetRNNDescriptor_v6)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R6
+
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(__macro)        \
+  __macro(cudnnConvolutionBiasActivationForward)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM(
+    PERFTOOLS_GPUTOOLS_CUDNN_WRAP_WITH_CHECKED_STREAM)
+#undef CUDNN_DNN_ROUTINE_EACH_R6_WITH_STREAM
 #endif
 
 // APIs in R7
@@ -291,13 +341,10 @@ CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 #undef CUDNN_DNN_ROUTINE_EACH_R7
 #endif
 
-#undef CUDNN_DNN_ROUTINE_EACH
-
 }  // namespace wrap
 
 namespace {
 
-// Forward declaration.
 cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
 
 cudnnHandle_t ToHandle(void* opaque_handle) {
@@ -420,7 +467,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
 }  // namespace
 
 CudnnSupport::CudnnSupport(CUDAExecutor* parent)
-    : parent_(parent), dnn_handle_(nullptr) {}
+    : parent_(parent), dnn_handle_(nullptr), current_dnn_stream_(nullptr) {}
 
 CudnnSupport::~CudnnSupport() {
   auto status = wrap::cudnnDestroy(parent_, ToHandle(dnn_handle_));
@@ -478,6 +525,14 @@ port::Status CudnnSupport::Init() {
                                    ToString(status))};
 }
 
+port::StatusOr<perftools::gputools::dnn::VersionInfo>
+CudnnSupport::GetVersion() {
+  CudnnVersion version;
+  TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&version));
+  return perftools::gputools::dnn::VersionInfo(
+      version.major_version, version.minor_version, version.patch_level);
+}
+
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
 class ScopedTensorDescriptor {
  public:
@@ -1654,6 +1709,12 @@ bool CudnnSupport::DoRnnForwardImpl(
 
   // check params size
   mutex_lock lock{dnn_handle_mutex_};
+  auto set_stream_status =
+      wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
+  if (set_stream_status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: "
+               << ToString(set_stream_status);
+  }
 
   if (!CheckRNNParameterSize(parent_, ToHandle(dnn_handle_), rnn_desc,
                              input_desc)) {
@@ -1714,7 +1775,7 @@ bool CudnnSupport::DoRnnForwardImpl(
   cudnnStatus_t status;
   if (!is_training) {
     status = wrap::cudnnRNNForwardInference(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -1727,7 +1788,7 @@ bool CudnnSupport::DoRnnForwardImpl(
         workspace.size() /*workSpaceSizeInBytes*/);
   } else {
     status = wrap::cudnnRNNForwardTraining(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -1804,6 +1865,12 @@ bool CudnnSupport::DoRnnBackwardImpl(
 
   // check params size
   mutex_lock lock{dnn_handle_mutex_};
+  auto set_stream_status =
+      wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
+  if (set_stream_status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: "
+               << ToString(set_stream_status);
+  }
 
   if (!CheckRNNParameterSize(parent_, ToHandle(dnn_handle_), rnn_desc,
                              input_desc)) {
@@ -1835,10 +1902,11 @@ bool CudnnSupport::DoRnnBackwardImpl(
   }
   // make the backward data call
   cudnnStatus_t status = wrap::cudnnRNNBackwardData(
-      parent_, ToHandle(dnn_handle_) /*handle*/, rnn_desc.handle() /*rnnDesc*/,
-      model_dims.seq_length /*seqLength*/, output_desc.handles() /*yDesc*/,
-      output_data.opaque() /*y*/, output_desc.handles() /*dyDesc*/,
-      output_backprop_data.opaque() /*dy*/, output_h_desc.handle() /*dhyDesc*/,
+      this, stream, ToHandle(dnn_handle_) /*handle*/,
+      rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
+      output_desc.handles() /*yDesc*/, output_data.opaque() /*y*/,
+      output_desc.handles() /*dyDesc*/, output_backprop_data.opaque() /*dy*/,
+      output_h_desc.handle() /*dhyDesc*/,
       output_h_backprop_data.opaque() /*dhy*/,
       output_c_desc.handle() /*dcyDesc*/,
       output_c_backprop_data.opaque() /*dcy*/,
@@ -1867,7 +1935,7 @@ bool CudnnSupport::DoRnnBackwardImpl(
     stream->ThenMemZero(params_backprop_data, params_backprop_data->size());
     // make the backward weight call
     status = wrap::cudnnRNNBackwardWeights(
-        parent_, ToHandle(dnn_handle_) /*handle*/,
+        this, stream, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
         input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
@@ -2511,8 +2579,7 @@ bool CudnnSupport::DoConvolveImpl(
                                    GetConvComputeType<T>()};
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -2662,7 +2729,7 @@ bool CudnnSupport::DoConvolveImpl(
     }
   }
   status = wrap::cudnnConvolutionForward(
-      parent_, ToHandle(dnn_handle_),
+      this, stream, ToHandle(dnn_handle_),
       /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
@@ -2731,8 +2798,7 @@ bool CudnnSupport::DoFusedConvolveImpl(
       static_cast<cudnnDataType_t>(cudnn_compute_type)};
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   CHECK(status == CUDNN_STATUS_SUCCESS)
       << "failed to set stream for cudnn handle: " << ToString(status);
 
@@ -2798,7 +2864,7 @@ bool CudnnSupport::DoFusedConvolveImpl(
           << "\noutput_data->opaque() = " << output_data->opaque();
 
   status = wrap::cudnnConvolutionBiasActivationForward(
-      parent_, ToHandle(dnn_handle_), /*alpha1=*/&conv_input_scale,
+      this, stream, ToHandle(dnn_handle_), /*alpha1=*/&conv_input_scale,
       /*srcDesc=*/conv_input_nd.handle(), /*srcData=*/conv_input_data.opaque(),
       /*filterDesc=*/filter.handle(), /*filterData=*/filter_data.opaque(),
       /*convDesc=*/conv.handle(), algo, /*workSpace=*/scratch.opaque(),
@@ -3003,8 +3069,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var,
     std::function<void()> inv_var_to_var) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -3040,7 +3105,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     }
 
     status = wrap::cudnnBatchNormalizationForwardTraining(
-        parent_, ToHandle(dnn_handle_), mode, &one, &zero,
+        this, stream, ToHandle(dnn_handle_), mode, &one, &zero,
         x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
         scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), 1.0,
         batch_mean_opaque, batch_var_opaque, epsilon, saved_mean->opaque(),
@@ -3057,7 +3122,7 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
     const void* maybe_inv_var = estimated_variance.opaque();
 #endif
     status = wrap::cudnnBatchNormalizationForwardInference(
-        parent_, ToHandle(dnn_handle_), mode, &one, &zero,
+        this, stream, ToHandle(dnn_handle_), mode, &one, &zero,
         x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
         scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(),
         estimated_mean.opaque(), maybe_inv_var, epsilon);
@@ -3108,8 +3173,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
     DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
     DeviceMemory<U>* offset_backprop) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -3130,7 +3194,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
   float zero = 0.0;
 
   status = wrap::cudnnBatchNormalizationBackward(
-      parent_, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero,
+      this, stream, ToHandle(dnn_handle_), mode, &one, &zero, &one, &zero,
       x_descriptor.handle(), x.opaque(), x_descriptor.handle(),
       y_backprop.opaque(), x_descriptor.handle(), x_backprop->opaque(),
       scale_offset_descriptor.handle(), scale.opaque(),
@@ -3320,7 +3384,7 @@ DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
   float alpha = 1.0f;
   float beta = 0.0f;
   auto status = wrap::cudnnTransformTensor(
-      parent_, ToHandle(dnn_handle_), &alpha, orig_out_back_nd.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, orig_out_back_nd.handle(),
       backward_output_data.opaque(), &beta, transformed_out_back_nd.handle(),
       (*transform_scratch)->mutable_device_memory()->opaque());
 
@@ -3339,8 +3403,7 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
                                      dnn::DataType output_type, float scale,
                                      DeviceMemoryBase* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  cudnnStatus_t status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                              AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3351,7 +3414,7 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
   ScopedTensorDescriptor output_tensor_desc(
       parent_, output_desc, ToCudnnDataType(output_type, output_desc.layout()));
   status = wrap::cudnnTransformTensor(
-      parent_, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
+      this, stream, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
       input_data.opaque(), &beta, output_tensor_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -3378,8 +3441,7 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3548,7 +3610,7 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
 #else
   status = wrap::cudnnConvolutionBackwardData_v3(
 #endif
-      parent_, ToHandle(dnn_handle_),
+      this, stream, ToHandle(dnn_handle_),
       /*alpha=*/alpha,
       /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(),
@@ -3649,8 +3711,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3820,7 +3881,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
 #else
   status = wrap::cudnnConvolutionBackwardFilter_v3(
 #endif
-      parent_, ToHandle(dnn_handle_), /*alpha=*/alpha,
+      this, stream, ToHandle(dnn_handle_), /*alpha=*/alpha,
       /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(),
       /*diffDesc=*/out_back_nd.handle(),
@@ -3916,8 +3977,7 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<T>* backward_bias_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
@@ -3932,7 +3992,7 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
   float beta = 0.0;
 
   status = wrap::cudnnConvolutionBackwardBias(
-      parent_, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
       input_data.opaque(), &beta, bias_nd.handle(),
       backward_bias_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4137,8 +4197,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
   }
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4152,7 +4211,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
 #else
   status = wrap::cudnnAddTensor_v3(
 #endif
-      parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
+      this, stream, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
       biases.opaque(), &beta, input_descriptor.handle(), output_data->opaque());
 
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4170,8 +4229,7 @@ bool CudnnSupport::DoActivate(Stream* stream,
                               DeviceMemory<float>* output_data,
                               uint64 options) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4215,7 +4273,7 @@ bool CudnnSupport::DoActivate(Stream* stream,
   // Beta is the output scaling factor.
   float beta = 0.0;
   status = wrap::cudnnActivationForward(
-      parent_, ToHandle(dnn_handle_),
+      this, stream, ToHandle(dnn_handle_),
 #if CUDNN_VERSION >= 5000
       activation_desc.handle(),
 #else
@@ -4239,8 +4297,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<double>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4256,7 +4313,7 @@ bool CudnnSupport::DoPoolForward(
                                    CUDNN_DATA_DOUBLE};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4274,8 +4331,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<float>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4291,7 +4347,7 @@ bool CudnnSupport::DoPoolForward(
                                    CUDNN_DATA_FLOAT};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4309,8 +4365,7 @@ bool CudnnSupport::DoPoolForward(
     const dnn::BatchDescriptor& output_dimensions,
     DeviceMemory<Eigen::half>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4325,7 +4380,7 @@ bool CudnnSupport::DoPoolForward(
   ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingForward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4345,8 +4400,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<double>& input_diff_data,
     DeviceMemory<double>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4362,7 +4416,7 @@ bool CudnnSupport::DoPoolBackward(
                                    CUDNN_DATA_DOUBLE};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4383,8 +4437,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<float>& input_diff_data,
     DeviceMemory<float>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4400,7 +4453,7 @@ bool CudnnSupport::DoPoolBackward(
                                    CUDNN_DATA_FLOAT};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4421,8 +4474,7 @@ bool CudnnSupport::DoPoolBackward(
     const DeviceMemory<Eigen::half>& input_diff_data,
     DeviceMemory<Eigen::half>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4437,7 +4489,7 @@ bool CudnnSupport::DoPoolBackward(
   ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
   ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
   status = wrap::cudnnPoolingBackward(
-      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      this, stream, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
       dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
       input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
       src_desc.handle(), output_diff_data->opaque());
@@ -4472,8 +4524,7 @@ bool CudnnSupport::DoNormalizeWithDimensions(
 
   // Launch the normalization.
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4488,7 +4539,7 @@ bool CudnnSupport::DoNormalizeWithDimensions(
   float beta = 0.0f;
 
   status = wrap::cudnnLRNCrossChannelForward(
-      parent_, ToHandle(dnn_handle_), normalize.handle(),
+      this, stream, ToHandle(dnn_handle_), normalize.handle(),
       CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dims.handle(), input_data.opaque(),
       &beta, dims.handle(), output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -4515,8 +4566,7 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
   }
 
   mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
+  auto status = wrap::cudnnSetStream(this, stream, ToHandle(dnn_handle_));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -4529,7 +4579,7 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
   float beta = 0.0f;
 
   status = wrap::cudnnLRNCrossChannelBackward(
-      parent_, ToHandle(dnn_handle_), normalize.handle(),
+      this, stream, ToHandle(dnn_handle_), normalize.handle(),
       CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, dims.handle(),
       normalized_data.opaque(), dims.handle(),
       normalized_variable_gradient.opaque(), dims.handle(), raw_data.opaque(),
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index e40ba9b012e7a168a1bf7fdaf726ccbeb4908c52..2e63009f1a5588ffd3847b3348de18326571aeca 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -46,6 +46,7 @@ class CudnnSupport : public dnn::DnnSupport {
   ~CudnnSupport() override;
 
   port::Status Init() override;
+  port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
@@ -624,10 +625,27 @@ class CudnnSupport : public dnn::DnnSupport {
                          dnn::DataType output_type, float scale,
                          DeviceMemoryBase* output_data) override;
 
- private:
-  // Guards the enqueueing of DNN operations via the dnn_handle_ below.
+  const Stream* GetCurrentDnnStream() const
+      SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
+    return current_dnn_stream_;
+  }
+
+  void SetCurrentDnnStream(Stream* stream)
+      EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_) {
+    current_dnn_stream_ = stream;
+  }
+
+  CUDAExecutor* GetParentExecutor() { return parent_; }
+
+  // Guards the enqueueing of DNN operations via the dnn_handle_ below, and
+  // access to current_dnn_stream_.
+  //
+  // This is a public member because we need to add thread safty annotations in
+  // the cudnn wrapper functions in the cc file, which need to access this
+  // mutex (the annotations require C++ permission checks).
   mutex dnn_handle_mutex_;
 
+ private:
   CUDAExecutor* parent_;  // Parent executor object. Not owned.
 
   // cudnn library handle. cudnnHandle_t type is not present in this header to
@@ -635,6 +653,9 @@ class CudnnSupport : public dnn::DnnSupport {
   // single cuda_dnn translation unit.
   void* dnn_handle_ GUARDED_BY(dnn_handle_mutex_);
 
+  // The current cudnn stream that is set by cudnnSetStream().
+  Stream* current_dnn_stream_ GUARDED_BY(dnn_handle_mutex_);
+
   // NOTE(keveman): Temporary data layout transformation until cuDNN supports
   // kBatchYXDepth for backward pass. This function allocates temporary memory,
   // lays out the source data into the temporary but in the kBatchDepthXY
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 58e1e58c593a3d938d97baff2356bce2c215a7a1..b06be69b64bf69812f349237f3cdebe5a7a5b6b9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -37,14 +37,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
-#if defined(PLATFORM_WINDOWS)
-// TODO: in windows ARRAYSIZE is defined in winnt.h but including it
-//  here creates a conflict with cuda.h - for now define it here.
-#define ARRAYSIZE(a) \
-  ((sizeof(a) / sizeof(*(a))) / \
-  static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
 bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
 bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
 bool FLAGS_gpuexec_cuda_device_0_only = false;
@@ -720,15 +712,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
         port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
         port::bit_cast<void *>(info_log_buffer.data()),
         port::bit_cast<void *>(uintptr_t(log_verbose))};
-    CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
+    CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
 
     CUresult res;
     {
       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
       // module loading: see http://b/13248943
 
-      res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options,
-                               option_values);
+      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options),
+                               options, option_values);
     }
 
     // The PTX JIT mutates the values in the option values array to reflect the
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 5ecaf46b8cae3c1e1f312816e7e5aec8ff8ce306..58ca0d3a9723a07a6c7cacf964603536fcbd5ce7 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -1127,7 +1127,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_name(device_name);
   }
 
-  for (size_t i = 0; i < ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
+  for (size_t i = 0; i < TF_ARRAYSIZE(kAllUnqueryableDeviceParams); i++) {
     const auto &params = kAllUnqueryableDeviceParams[i];
     if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) {
       builder.set_blocks_per_core_limit(params.blocks_per_core_limit);
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/cuda/cuda_timer.cc
index 4bd5503348f4dc92a0ce3c18aaf0128174a94121..8532f08725d18f6b81ab500026bfeed68a502f35 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/cuda/cuda_timer.cc
@@ -27,16 +27,18 @@ namespace cuda {
 bool CUDATimer::Init() {
   CHECK(start_event_ == nullptr && stop_event_ == nullptr);
   CudaContext* context = parent_->cuda_context();
-  if (!CUDADriver::CreateEvent(context, &start_event_,
-                               CUDADriver::EventFlags::kDefault)
-           .ok()) {
+  port::Status status = CUDADriver::CreateEvent(
+      context, &start_event_, CUDADriver::EventFlags::kDefault);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
     return false;
   }
 
-  if (!CUDADriver::CreateEvent(context, &stop_event_,
-                               CUDADriver::EventFlags::kDefault)
-           .ok()) {
-    port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
+  status = CUDADriver::CreateEvent(context, &stop_event_,
+                                   CUDADriver::EventFlags::kDefault);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    status = CUDADriver::DestroyEvent(context, &start_event_);
     if (!status.ok()) {
       LOG(ERROR) << status;
     }
@@ -71,16 +73,22 @@ float CUDATimer::GetElapsedMilliseconds() const {
   return elapsed_milliseconds;
 }
 
-bool CUDATimer::Start(CUDAStream *stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), start_event_,
-                                 stream->cuda_stream())
-      .ok();
+bool CUDATimer::Start(CUDAStream* stream) {
+  port::Status status = CUDADriver::RecordEvent(
+      parent_->cuda_context(), start_event_, stream->cuda_stream());
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
 }
 
-bool CUDATimer::Stop(CUDAStream *stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), stop_event_,
-                                 stream->cuda_stream())
-      .ok();
+bool CUDATimer::Stop(CUDAStream* stream) {
+  port::Status status = CUDADriver::RecordEvent(
+      parent_->cuda_context(), stop_event_, stream->cuda_stream());
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
 }
 
 }  // namespace cuda
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
index 230adafeb112f682b5ece4778921e18a4ad25f87..42b3dc8cc67b085d7c0d483f2922ef4f19df0092 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cudnn_version.h"
 
-#include "testing/base/public/gunit.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace perftools {
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 43cfd313c113fdbab6bc4166a3cacedb0495eddc..47dcd80218a1de0ef722e68fac882a73412fd3bc 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <functional>
 #include <limits>
 #include <memory>
+#include <tuple>
 
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
@@ -875,6 +876,22 @@ enum class ElementwiseOperation { kAdd, kMultiply };
 
 string ElementwiseOperationString(ElementwiseOperation op);
 
+// A simple class representing the version of the backing library, to 
+// workaround the "too perfect forwarding" issue in gcc6+ compilers. 
+// See PR#16309 and issue #18402 for links discussing the issue.
+class VersionInfo {
+ public:
+  VersionInfo(int major = 0, int minor = 0, int patch = 0)
+      : major_(major), minor_(minor), patch_(patch) {}
+  int major_version() { return major_; }
+  int minor_version() { return minor_; }
+  int patch() { return patch_; }
+ private:
+  int major_;
+  int minor_;
+  int patch_;
+};
+
 // Suite of operations typically used for implementing Deep/Convolutional Neural
 // Nets. Note: A false return value of an operation indicates the
 // implementation is not available.
@@ -885,6 +902,12 @@ class DnnSupport {
 
   virtual port::Status Init() = 0;
 
+  // Gets the version of the backing library, as a VersionInfo object.
+  virtual port::StatusOr<VersionInfo> GetVersion() {
+    return port::UnimplementedError(
+        "DnnSupport::GetVersion not implemented on this platform.");
+  }
+
   // Performs a single-precision forward batch normalization operation onto
   // the stream.
   //
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index 138738ecab54986fd7d5cd76839d59da55623b1f..3b97929b37d82dc6fc00c70f617c5868b28d547e 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -14,238 +14,19 @@ limitations under the License.
 ==============================================================================*/
 
 // IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
-//
-// StatusOr<T> is the union of a Status object and a T
-// object. StatusOr models the concept of an object that is either a
-// usable value, or an error Status explaining why such a value is
-// not present. To this end, StatusOr<T> does not allow its Status
-// value to be Status::OK. Further, StatusOr<T*> does not allow the
-// contained pointer to be NULL.
-//
-// The primary use-case for StatusOr<T> is as the return value of a
-// function which may fail.
-//
-// Example client usage for a StatusOr<T>, where T is not a pointer:
-//
-//  StatusOr<float> result = DoBigCalculationThatCouldFail();
-//  if (result.ok()) {
-//    float answer = result.ValueOrDie();
-//    printf("Big calculation yielded: %f", answer);
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example client usage for a StatusOr<T*>:
-//
-//  StatusOr<Foo*> result = FooFactory::MakeNewFoo(arg);
-//  if (result.ok()) {
-//    std::unique_ptr<Foo> foo(result.ValueOrDie());
-//    foo->DoSomethingCool();
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example client usage for a StatusOr<std::unique_ptr<T>>:
-//
-//  StatusOr<std::unique_ptr<Foo>> result = FooFactory::MakeNewFoo(arg);
-//  if (result.ok()) {
-//    std::unique_ptr<Foo> foo = std::move(result.ValueOrDie());
-//    foo->DoSomethingCool();
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example factory implementation returning StatusOr<T*>:
-//
-//  StatusOr<Foo*> FooFactory::MakeNewFoo(int arg) {
-//    if (arg <= 0) {
-//      return Status(port::error::INVALID_ARGUMENT,
-//                            "Arg must be positive");
-//    } else {
-//      return new Foo(arg);
-//    }
-//  }
-//
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 
-#include <new>
-#include "tensorflow/stream_executor/platform/port.h"
-#include <type_traits>
-#include <utility>
-
-#include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/compiler/xla/statusor.h"
 
 namespace perftools {
 namespace gputools {
 namespace port {
 
-template<typename T>
-class StatusOr {
-  template<typename U> friend class StatusOr;
-
- public:
-  // Construct a new StatusOr with Status::UNKNOWN status
-  StatusOr() : status_(error::UNKNOWN, "") {}
-
-  // Construct a new StatusOr with the given non-ok status. After calling
-  // this constructor, calls to ValueOrDie() is invalid.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return
-  // value, so it is convenient and sensible to be able to do 'return
-  // Status()' when the return type is StatusOr<T>.
-  //
-  // REQUIRES: status != Status::OK.
-  // In optimized builds, passing Status::OK here will have the effect
-  // of passing PosixErrorSpace::EINVAL as a fallback.
-  StatusOr(const Status& status);  // NOLINT
-
-  // Construct a new StatusOr with the given value. If T is a plain pointer,
-  // value must not be NULL. After calling this constructor, calls to
-  // ValueOrDie() will succeed, and calls to status() will return OK.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return type
-  // so it is convenient and sensible to be able to do 'return T()'
-  // when the return type is StatusOr<T>.
-  //
-  // REQUIRES: if T is a plain pointer, value != NULL.
-  // In optimized builds, passing a NULL pointer here will have
-  // the effect of passing PosixErrorSpace::EINVAL as a fallback.
-  StatusOr(const T& value);  // NOLINT
-
-  // Conversion copy constructor, T must be copy constructible from U
-  template <typename U>
-  StatusOr(const StatusOr<U>& other)  // NOLINT
-      : status_(other.status_),
-        value_(other.value_) {}
-
-  // Conversion assignment operator, T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(const StatusOr<U>& other) {
-    status_ = other.status_;
-    value_ = other.value_;
-    return *this;
-  }
-
-  // Rvalue-reference overloads of the other constructors and assignment
-  // operators, to support move-only types and avoid unnecessary copying.
-  StatusOr(T&& value);  // NOLINT
-
-  // Move conversion operator to avoid unnecessary copy.
-  // T must be assignable from U.
-  // Not marked with explicit so the implicit conversion can happen.
-  template <typename U>
-  StatusOr(StatusOr<U>&& other)  // NOLINT
-      : status_(std::move(other.status_)),
-        value_(std::move(other.value_)) {}
-
-  // Move assignment operator to avoid unnecessary copy.
-  // T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(StatusOr<U>&& other) {
-    status_ = std::move(other.status_);
-    value_ = std::move(other.value_);
-    return *this;
-  }
-
-  // Returns a reference to our status. If this contains a T, then
-  // returns Status::OK.
-  const Status& status() const { return status_; }
-
-  // Returns this->status().ok()
-  bool ok() const { return status_.ok(); }
-
-  // Returns a reference to our current value, requires that this->ok().
-  // If you need to initialize a T object from the stored value,
-  // ConsumeValueOrDie() may be more efficient.
-  const T& ValueOrDie() const;
-  T& ValueOrDie();
-
-  // Returns our current value, requires this->ok(). Use this if
-  // you would otherwise want to say std::move(s.ValueOrDie()), for example
-  // if you need to initialize a T object from the stored value and you don't
-  // need subsequent access to the stored value. It uses T's move constructor,
-  // if it has one, so it will work with move-only types, and will often be
-  // more efficient than ValueOrDie, but may leave the stored value
-  // in an arbitrary valid state.
-  T ConsumeValueOrDie();
-
- private:
-  Status status_;
-  T value_;
-
-  void CheckValueNotNull(const T& value);
-
-  template <typename U>
-  struct IsNull {
-    // For non-pointer U, a reference can never be NULL.
-    static inline bool IsValueNull(const U& t) { return false; }
-  };
-
-  template <typename U>
-  struct IsNull<U*> {
-    static inline bool IsValueNull(const U* t) { return t == NULL; }
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Implementation details for StatusOr<T>
-
-template <typename T>
-StatusOr<T>::StatusOr(const T& value)
-    : status_(), value_(value) {
-  CheckValueNotNull(value);
-}
-
-template <typename T>
-const T& StatusOr<T>::ValueOrDie() const {
-  TF_CHECK_OK(status_);
-  return value_;
-}
-
-template <typename T>
-T& StatusOr<T>::ValueOrDie() {
-  TF_CHECK_OK(status_);
-  return value_;
-}
-
-template <typename T>
-T StatusOr<T>::ConsumeValueOrDie() {
-  TF_CHECK_OK(status_);
-  return std::move(value_);
-}
-
-template <typename T>
-StatusOr<T>::StatusOr(const Status& status)
-    : status_(status) {
-  assert(!status.ok());
-  if (status.ok()) {
-    status_ =
-        Status(error::INTERNAL,
-               "Status::OK is not a valid constructor argument to StatusOr<T>");
-  }
-}
-
-template <typename T>
-StatusOr<T>::StatusOr(T&& value)
-    : status_() {
-  CheckValueNotNull(value);
-  value_ = std::move(value);
-}
-
+// Use XLA's StatusOr so we don't duplicate code.
 template <typename T>
-void StatusOr<T>::CheckValueNotNull(const T& value) {
-  assert(!IsNull<T>::IsValueNull(value));
-  if (IsNull<T>::IsValueNull(value)) {
-    status_ =
-        Status(error::INTERNAL,
-               "NULL is not a valid constructor argument to StatusOr<T*>");
-  }
-}
+using StatusOr = ::xla::StatusOr<T>;
 
 }  // namespace port
 }  // namespace gputools
diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h
index 6603df487878e62271a144b14d78518044c66c81..db621004353f33890037cdcf792f90ee52a1d01c 100644
--- a/tensorflow/stream_executor/platform/port.h
+++ b/tensorflow/stream_executor/platform/port.h
@@ -39,12 +39,6 @@ using tensorflow::uint64;
 using std::string;
 #endif
 
-#if !defined(COMPILER_MSVC)
-#define ARRAYSIZE(a)              \
-    ((sizeof(a) / sizeof(*(a))) / \
-    static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
 using tensorflow::LinkerInitialized;
 using tensorflow::LINKER_INITIALIZED;
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 528f811b40ad7711407c856af804cbe2829d8b32..51e856bed0e995fc6e1e367401c93b5f3648c361 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -163,7 +163,6 @@ def if_override_eigen_strong_inline(a):
 
 def get_win_copts(is_external=False):
     WINDOWS_COPTS = [
-        "/D__VERSION__=\\\"MSVC\\\"",
         "/DPLATFORM_WINDOWS",
         "/DEIGEN_HAS_C99_MATH",
         "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
@@ -1704,7 +1703,7 @@ def tf_version_info_genrule():
       ],
       outs=["util/version_info.cc"],
       cmd=
-      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 9f1bdd8aae7f4ef0540070fa20530f24798068bd..a1c569951e99162c8048b7b760c25df7b2f29420 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -32,6 +32,7 @@ genrule(
     # api/module1/module2/__init__.py and api/module3/__init__.py.
     # keep sorted
     outs = [
+        # BEGIN GENERATED FILES
         "api/__init__.py",
         "api/app/__init__.py",
         "api/bitwise/__init__.py",
@@ -117,6 +118,7 @@ genrule(
         "api/train/__init__.py",
         "api/train/queue_runner/__init__.py",
         "api/user_ops/__init__.py",
+        # END GENERATED FILES
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 70f9776b0846582b8d4e9710879883fdf975a001..c7748f5b7a7c1013f11e053e7a36ddfd9594c6ea 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -67,18 +67,23 @@ def format_import(source_module_name, source_name, dest_name):
       return 'import %s as %s' % (source_name, dest_name)
 
 
-class _ModuleImportsBuilder(object):
+class _ModuleInitCodeBuilder(object):
   """Builds a map from module name to imports included in that module."""
 
   def __init__(self):
-    self.module_imports = collections.defaultdict(list)
-    self._seen_api_names = set()
+    self.module_imports = collections.defaultdict(
+        lambda: collections.defaultdict(set))
+    self._dest_import_to_id = collections.defaultdict(int)
+    # Names that start with underscore in the root module.
+    self._underscore_names_in_root = []
 
   def add_import(
-      self, dest_module_name, source_module_name, source_name, dest_name):
+      self, symbol_id, dest_module_name, source_module_name, source_name,
+      dest_name):
     """Adds this import to module_imports.
 
     Args:
+      symbol_id: (number) Unique identifier of the symbol to import.
       dest_module_name: (string) Module name to add import to.
       source_module_name: (string) Module to import from.
       source_name: (string) Name of the symbol to import.
@@ -89,34 +94,67 @@ class _ModuleImportsBuilder(object):
         dest_name has already been added to dest_module_name.
     """
     import_str = format_import(source_module_name, source_name, dest_name)
-    if import_str in self.module_imports[dest_module_name]:
-      return
 
     # Check if we are trying to expose two different symbols with same name.
     full_api_name = dest_name
     if dest_module_name:
       full_api_name = dest_module_name + '.' + full_api_name
-    if full_api_name in self._seen_api_names:
+    if (full_api_name in self._dest_import_to_id and
+        symbol_id != self._dest_import_to_id[full_api_name] and
+        symbol_id != -1):
       raise SymbolExposedTwiceError(
           'Trying to export multiple symbols with same name: %s.' %
           full_api_name)
-    self._seen_api_names.add(full_api_name)
+    self._dest_import_to_id[full_api_name] = symbol_id
 
-    self.module_imports[dest_module_name].append(import_str)
+    if not dest_module_name and dest_name.startswith('_'):
+      self._underscore_names_in_root.append(dest_name)
 
+    # The same symbol can be available in multiple modules.
+    # We store all possible ways of importing this symbol and later pick just
+    # one.
+    self.module_imports[dest_module_name][full_api_name].add(import_str)
 
-def get_api_imports():
-  """Get a map from destination module to formatted imports.
+  def build(self):
+    """Get a map from destination module to __init__.py code for that module.
+
+    Returns:
+      A dictionary where
+        key: (string) destination module (for e.g. tf or tf.consts).
+        value: (string) text that should be in __init__.py files for
+          corresponding modules.
+    """
+    module_text_map = {}
+    for dest_module, dest_name_to_imports in self.module_imports.items():
+      # Sort all possible imports for a symbol and pick the first one.
+      imports_list = [
+          sorted(imports)[0]
+          for _, imports in dest_name_to_imports.items()]
+      module_text_map[dest_module] = '\n'.join(sorted(imports_list))
+
+    # Expose exported symbols with underscores in root module
+    # since we import from it using * import.
+    underscore_names_str = ', '.join(
+        '\'%s\'' % name for name in self._underscore_names_in_root)
+    module_text_map[''] += '''
+_names_with_underscore = [%s]
+__all__ = [s for s in dir() if not s.startswith('_')]
+__all__.extend([s for s in _names_with_underscore])
+''' % underscore_names_str
+
+    return module_text_map
+
+
+def get_api_init_text():
+  """Get a map from destination module to __init__.py code for that module.
 
   Returns:
     A dictionary where
       key: (string) destination module (for e.g. tf or tf.consts).
-      value: List of strings representing module imports
-          (for e.g. 'from foo import bar') and constant
-          assignments (for e.g. 'FOO = 123').
+      value: (string) text that should be in __init__.py files for
+        corresponding modules.
   """
-  module_imports_builder = _ModuleImportsBuilder()
-  visited_symbols = set()
+  module_code_builder = _ModuleInitCodeBuilder()
 
   # Traverse over everything imported above. Specifically,
   # we want to traverse over TensorFlow Python modules.
@@ -131,8 +169,6 @@ def get_api_imports():
 
     for module_contents_name in dir(module):
       attr = getattr(module, module_contents_name)
-      if id(attr) in visited_symbols:
-        continue
 
       # If attr is _tf_api_constants attribute, then add the constants.
       if module_contents_name == _API_CONSTANTS_ATTR:
@@ -140,30 +176,25 @@ def get_api_imports():
           for export in exports:
             names = export.split('.')
             dest_module = '.'.join(names[:-1])
-            module_imports_builder.add_import(
-                dest_module, module.__name__, value, names[-1])
+            module_code_builder.add_import(
+                -1, dest_module, module.__name__, value, names[-1])
         continue
 
       _, attr = tf_decorator.unwrap(attr)
       # If attr is a symbol with _tf_api_names attribute, then
       # add import for it.
       if hasattr(attr, '__dict__') and _API_NAMES_ATTR in attr.__dict__:
-        # If the same symbol is available using multiple names, only create
-        # imports for it once.
-        if id(attr) in visited_symbols:
-          continue
-        visited_symbols.add(id(attr))
-
         for export in attr._tf_api_names:  # pylint: disable=protected-access
           names = export.split('.')
           dest_module = '.'.join(names[:-1])
-          module_imports_builder.add_import(
-              dest_module, module.__name__, module_contents_name, names[-1])
+          module_code_builder.add_import(
+              id(attr), dest_module, module.__name__, module_contents_name,
+              names[-1])
 
   # Import all required modules in their parent modules.
   # For e.g. if we import 'foo.bar.Value'. Then, we also
   # import 'bar' in 'foo'.
-  imported_modules = set(module_imports_builder.module_imports.keys())
+  imported_modules = set(module_code_builder.module_imports.keys())
   for module in imported_modules:
     if not module:
       continue
@@ -176,11 +207,11 @@ def get_api_imports():
         parent_module += ('.' + module_split[submodule_index-1] if parent_module
                           else module_split[submodule_index-1])
         import_from += '.' + parent_module
-      module_imports_builder.add_import(
-          parent_module, import_from, module_split[submodule_index],
-          module_split[submodule_index])
+      module_code_builder.add_import(
+          -1, parent_module, import_from,
+          module_split[submodule_index], module_split[submodule_index])
 
-  return module_imports_builder.module_imports
+  return module_code_builder.build()
 
 
 def create_api_files(output_files):
@@ -196,16 +227,19 @@ def create_api_files(output_files):
   """
   module_name_to_file_path = {}
   for output_file in output_files:
+    # Convert path separators to '/' for easier parsing below.
+    normalized_output_file = output_file.replace(os.sep, '/')
     if _API_DIR not in output_file:
       raise ValueError(
           'Output files must be in api/ directory, found %s.' % output_file)
     # Get the module name that corresponds to output_file.
     # First get module directory under _API_DIR.
     module_dir = os.path.dirname(
-        output_file[output_file.rfind(_API_DIR)+len(_API_DIR):])
+        normalized_output_file[
+            normalized_output_file.rfind(_API_DIR)+len(_API_DIR):])
     # Convert / to .
     module_name = module_dir.replace('/', '.').strip('.')
-    module_name_to_file_path[module_name] = output_file
+    module_name_to_file_path[module_name] = os.path.normpath(output_file)
 
   # Create file for each expected output in genrule.
   for module, file_path in module_name_to_file_path.items():
@@ -213,11 +247,11 @@ def create_api_files(output_files):
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  module_imports = get_api_imports()
+  module_text_map = get_api_init_text()
 
   # Add imports to output files.
   missing_output_files = []
-  for module, exports in module_imports.items():
+  for module, text in module_text_map.items():
     # Make sure genrule output file list is in sync with API exports.
     if module not in module_name_to_file_path:
       module_file_path = '"api/%s/__init__.py"' %  (
@@ -225,7 +259,7 @@ def create_api_files(output_files):
       missing_output_files.append(module_file_path)
       continue
     with open(module_name_to_file_path[module], 'w') as fp:
-      fp.write(_GENERATED_FILE_HEADER + '\n'.join(exports))
+      fp.write(_GENERATED_FILE_HEADER + text)
 
   if missing_output_files:
     raise ValueError(
@@ -242,6 +276,16 @@ if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
       'outputs', metavar='O', type=str, nargs='+',
-      help='Python files that we expect this script to output.')
+      help='If a single file is passed in, then we we assume it contains a '
+      'semicolon-separated list of Python files that we expect this script to '
+      'output. If multiple files are passed in, then we assume output files '
+      'are listed directly as arguments.')
   args = parser.parse_args()
-  main(args.outputs)
+  if len(args.outputs) == 1:
+    # If we only get a single argument, then it must be a file containing
+    # list of outputs.
+    with open(args.outputs[0]) as output_list_file:
+      outputs = [line.strip() for line in output_list_file.read().split(';')]
+  else:
+    outputs = args.outputs
+  main(outputs)
diff --git a/tensorflow/tools/api/generator/create_python_api_test.py b/tensorflow/tools/api/generator/create_python_api_test.py
index 2760779e6e0a909cb077f534db40710ab6a11b32..218c8120453c8dca6e81146eb06e8243a3cd424d 100644
--- a/tensorflow/tools/api/generator/create_python_api_test.py
+++ b/tensorflow/tools/api/generator/create_python_api_test.py
@@ -56,7 +56,7 @@ class CreatePythonApiTest(test.TestCase):
     del sys.modules[_MODULE_NAME]
 
   def testFunctionImportIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected_import = (
         'from test.tensorflow.test_module import test_op as test_op1')
     self.assertTrue(
@@ -69,14 +69,14 @@ class CreatePythonApiTest(test.TestCase):
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testClassImportIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected_import = 'from test.tensorflow.test_module import TestClass'
     self.assertTrue(
         'TestClass' in str(imports),
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testConstantIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected = 'from test.tensorflow.test_module import _TEST_CONSTANT'
     self.assertTrue(expected in str(imports),
                     msg='%s not in %s' % (expected, str(imports)))
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
deleted file mode 100644
index 11565bd3e4178202fa82e2e079d1035190dbd6ec..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
+++ /dev/null
@@ -1,65 +0,0 @@
-path: "tensorflow.distributions.bijectors.Bijector"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.bijector_impl.Bijector\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_ndims"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph_parents"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_constant_jacobian"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'event_ndims\', \'graph_parents\', \'is_constant_jacobian\', \'validate_args\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "forward"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward\'], "
-  }
-  member_method {
-    name: "forward_event_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "forward_event_shape_tensor"
-    argspec: "args=[\'self\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "forward_log_det_jacobian"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_log_det_jacobian\'], "
-  }
-  member_method {
-    name: "inverse"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
-  }
-  member_method {
-    name: "inverse_event_shape"
-    argspec: "args=[\'self\', \'output_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "inverse_event_shape_tensor"
-    argspec: "args=[\'self\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "inverse_log_det_jacobian"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_log_det_jacobian\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
deleted file mode 100644
index 1e5fe624eb838e188594d03b656c12890db344a1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
+++ /dev/null
@@ -1,66 +0,0 @@
-path: "tensorflow.distributions.bijectors.Identity"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.identity_bijector.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.bijector_impl.Bijector\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_ndims"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph_parents"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_constant_jacobian"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'validate_args\', \'event_ndims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'identity\'], "
-  }
-  member_method {
-    name: "forward"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward\'], "
-  }
-  member_method {
-    name: "forward_event_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "forward_event_shape_tensor"
-    argspec: "args=[\'self\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "forward_log_det_jacobian"
-    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_log_det_jacobian\'], "
-  }
-  member_method {
-    name: "inverse"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
-  }
-  member_method {
-    name: "inverse_event_shape"
-    argspec: "args=[\'self\', \'output_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "inverse_event_shape_tensor"
-    argspec: "args=[\'self\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_event_shape_tensor\'], "
-  }
-  member_method {
-    name: "inverse_log_det_jacobian"
-    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_log_det_jacobian\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt
deleted file mode 100644
index 1d0144f36ec332740889dc8caa5add8f41960d92..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt
+++ /dev/null
@@ -1,11 +0,0 @@
-path: "tensorflow.distributions.bijectors"
-tf_module {
-  member {
-    name: "Bijector"
-    mtype: "<class \'abc.ABCMeta\'>"
-  }
-  member {
-    name: "Identity"
-    mtype: "<class \'abc.ABCMeta\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
index 2fba7c506ed9d2490e7c19c1746d3f4e9645424f..90b60ef074dd2eaf911291e6c725b98e2891e728 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
@@ -68,10 +68,6 @@ tf_module {
     name: "Uniform"
     mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
   }
-  member {
-    name: "bijectors"
-    mtype: "<type \'module\'>"
-  }
   member_method {
     name: "kl_divergence"
     argspec: "args=[\'distribution_a\', \'distribution_b\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index 05e603efb7cbad8c4c42a7a15074d2634af8d21c..c8da55d8021b7659446d0771a089b7b605d86c4f 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -6,6 +6,10 @@ tf_class {
     name: "cluster_spec"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "device_fn"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "evaluation_master"
     mtype: "<type \'property\'>"
@@ -84,7 +88,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\'], "
   }
   member_method {
     name: "replace"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 7be2f4f61f6b9637f372591e49efc0c93c7a8c0a..7713d78b8a505d464800ada0c84ca126213d95d0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -74,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -128,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index bf361cf8054571c0b056e1373acb838aaea87173..69b81f75fa078856b4ed9fccd1037059efd90a0b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -75,10 +70,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -133,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
index db8f626b98b70fd99f38e696aa16c72e74e86e25..96272d1b7d61430188bbbf2680bd2beb9f1e9675 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
index 809b3a5430449176a0d7423ec7f4499ceb620890..8fd55c8686de77ec764e9d564c78d0df4f545915 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
index 68d41bb6cc258ca87d4664ac0fb9d5649f89ebaf..47d1532c3c8cf248f6e9a9a35e10b9559286263f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 970b777e514194db4ac49fe58bea737b35436217..797d422a90a5ad21d0014a0003b11d281c25e579 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 529c64ab293d596012aefd42e0695bd1eb7e44d1..269be1455b6bf3bbe325f3928584960578e3793d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 7e7c330d74fe3b71ecd0eb87e34719e47ae70784..344813621534dbb5de3719088c06313e55519dd7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index ada8466d7473072b1878861ab36ec40b07fa1914..979008d0edb7d0f9d9c1246e1fcc7d2e2871d28c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
index 2a5c1cd530a7a532f6cdd3c184f4ee7eb88d23d3..0ffdffd4cdee14fbfeb68f2575300632ab21d7a6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 9a2cb29815d59f3761ea25e9ea36ff6489c85b88..6b00f110eea2aaa551bffe8ec225042c5469210e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index f5e991ea42e5ee2723b64574d4598dc8463f1c8c..caff5a2f1db61c6958980446d3bd54009776e1a4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 31732214a62524017e39776cdfb9ab629746e8ae..4a7239492177aae2ee098fc3033904d3d1a31ba4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
index 422eddf10db6763e10405dba5537ca161d1b8994..9804394fa53d6b3c1ff136a73212863143bbdb39 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\', \'fused\', \'trainable\', \'virtual_batch_size\', \'adjustment\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'0.99\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 9053a37916314198842bc21b0608a9b69a64c264..5e5b04c7c695c6d88e7f42b77290a582be087763 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -18,10 +17,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -70,10 +65,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -112,11 +103,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
index 3d536d2182fc4480a2ee5fba177543ca21fbd5ac..b8eb4079b9eea3b054f9c2ad4298f6a1669de79e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index a535f1817021f07c29e19fca4b3a7e7e60f59fb1..3fdb101425d0f010f77cd70bc3721af269a36b0a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -47,10 +46,6 @@ tf_class {
     name: "filters"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -139,10 +134,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -193,11 +184,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
index 801a0339720919f8b3f6beee0f045d58b2c0a371..0be42471e35eeba224376b24aa846db69e011274 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 13352e264a5305190717bb973a3f2bce4d7f4fff..39ba31a70942811cbd36caf33c9bda90a5449703 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
index f400e4a15c362037e85ac375cee98bb5f6358669..26d9d8c476f4e429ffe112cd490ad277b478c65b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index b3a9f573b8ba652d2544b21f36f65fe81a6ebb50..43611017fa37077c7ff05e690dfd50a0e6e5ae1e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
index a9be09c0abd19aeb4df30116ef2befc3948bfbf4..fa4925ab99d719104c5ca1a0003c25d85f78f3df 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
index be1ef5eb928d16cc6bf78c289aa20d815c728b23..c5c5d5e7c083dca63972cb7033e675aa483d8039 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 30034f7eaf6d9073695353e5c8d9ead0cc8de7cc..36dc2d2e9a70fe7a1d32352a14dabbade2a2efc2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Convolution2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
index 189b38054c004facfeeff8ad2ae87848b89040f2..23ec74370bab010d2b5ba257502a32c8ca7e4a57 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index a76d85c629c1fe620dafd62a0f0e05e9009109e2..0e4089c5785ccdcafbb8b3dc1ca75ffbe49d9434 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
index 782195d4ad5883d8c0ea6a657cc10258f2080a55..23ddbe1a925e33432727d13dc875972136083056 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
index 2cb7a39ea595e1ff699b96554cb135377d20a488..e04ab6bea85baa1252c9c43f891f9ed5a9dedb87 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
index 80803306992bba3b601824a93cb3086ef3947369..655314afffd1e4fb0987c913eb69edc4254c77ef 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
index 678f40bbc23db15ff7c1138169478fb4412a449d..d5215f1330a2fd0db4696a7b743d931e164a5a9f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
index fac826109b6a32305ece86c4990f08afe2236ce8..310a3c3b918684803c86d8e2ea1731604a041cdb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index b38716aa2cd9cedd94dbfe1468c3988e419db714..2d67b5f720209e3648e69659d44c3d8c4e639231 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -2,10 +2,8 @@ path: "tensorflow.keras.layers.DepthwiseConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.DepthwiseConv2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
index 285d544af2d69d564afdec748598b39b6b95670f..0e493a7f2bffc772e9cc9cd5afdc4c092fb92118 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
index b77976974cccb96fc2373c093d2bdf279560c46f..14726b4b6cecc39da98af4220211a1c0351b2ff1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
index b07714d3f2d158496e0482f8611e55ea0fb0fd51..32a50455ed8a2a146a81fde0d884ebc867b8d0ed 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
index e67d4ddfc47077d62319ab097e5333a373cbfc80..2f615d81124a7ef5e1bd7181a10abfb1b7a8df24 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
index b2a668e5a88d312656f48ddd0e9f7aa9f6306991..82dc878a8c7f7f011df4dd3fa0445217fa250a98 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 4274b8d42552c4421acaee5e18393ea6093acb7d..d79d02b95433a7399f27ff1354cba315f8a2c3ae 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index 8d9f06083cfc271ebacbe714bca0037a94753fd7..1d38ae64bb86d11ecd352371608a11f6736bb0ac 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -126,10 +121,6 @@ tf_class {
     name: "reset_after"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -176,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index f4f1a5d51c5d5689918af4facf907f79d9ca71ec..135de9cd95141a93d1a45b80ada66e339c484c89 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
index e502df5e177d422403d0643c18a9588afb9d9713..5db6e433ee02b8050822f76bb762329055c11aa4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 9c8d5bfcd8966384230e7d5cdcc1cac53a0eab9a..bf0dba0a925b4fb3a88173f4a6dfcc565c6edd91 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 8dd65f1f248daaf120780f19050c45d297b7902e..6da98036094228603f12f65f66d958e2e4b9daeb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 5e30571cc730ee23767a044036b590460deec00b..345593dec812a240251c0c07da759e131fefaad9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index ba90fa454696d1cb4e77d80a2dc77ff65def4714..5d3be9085e51520c48a8d33b1dde6480a0039bad 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 8823857758307c208527b144c0cc73b566f2f115..0b79a87e0507ecc795f14a63684dd5b5d7dafc1e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 500ced852ba6b19502769ba9052f2e364af7e283..68cdbac652f74b72d1cb769fbefbee750025767d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index cf2717ed46b56e639fb774c1e922648e1653ec0d..d5872b444fa9ae617e6ad55bc39f43ec4be7d92f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index a86ff1a46997f19b11e6ef03be432b45687a2df2..4b0cf9a5d38f868d9fdd16a042daea6d0f56fdf8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index e01cc7c1b09ad6a40380613d54b771c6a1c89c1c..4c1adb2131f204121cf74c9a77d346902632fef2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 259c1fb37c787f5318570b7aca6935d2f0ed997f..815f1cf580562e62db99862e51ba02e2b2051b57 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 0c41bf97f763f1e40e8fac714709ccac1483a00b..e027dd6cc282b75a6a6c2a78878fa57d6706b547 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index bec8817aa393ba2d8a6410408938402366cbb01d..c647b24a23258b96a3c1780fd2f3e499658cfe7d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
index 17be86222901c0f5a9a18c0e5f1c5bcac6c06a17..75d70734b4144627f02a8f619991356e38889389 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
index 3aeef347ae1f96a3ef40493cc6b722a887e81786..29edabe0483a21d7db35eec04d6ae7a855a82da1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 6d2a8c56196d9b3c80f570c7f1d3ac803253fff6..0ed383a3554f81c3db490cf5d242546a14b64d15 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 490b5b618c65e28f1ae2e01e8d35e7f3973cc180..6d14c9c8f69286552271ca6dab5271a5af48593f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -122,10 +117,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -176,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
index 21a65b838af35e2f540eacab823513e7bf54b434..ddf96aba34bf574f4b9046ed932d8f136864f157 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
index 127b04738e70c11b2dc1071cf174cf5de23c5133..aca282d62427f5c8186fa3ac86daebd6fba09ce5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,6 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -12,10 +11,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +59,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -90,7 +81,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -102,11 +93,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 87e49f2ed5b5d73aee5e9aa2511485b1f3f4bcd9..b9c53b43c87bd8c1047663001159d7286360a008 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 1aa3aad3246b83931a47e69a4aa76fdf2b5aee22..2ee566d03b4631ebcff6bcb0b93ec274849c5b67 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 5e9dc7d4774c651a186a4e320d0cfd088e87b6b3..db0d0e816a6d863f13c7eb085edf269d71e2f252 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
index 0d101e5b68cdb2cdf24ed472c724cfc885e3d95d..82008b89d038e26155b8ae952c2943557ed8c35e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
index c85cd49ac8ce2c1fc0759671865b7174cd1c1480..31a34a17d04129c8dfe8ec6f98b9fdbc110e13f4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 4f59e330c92f96101c65a9a24f66196e84587ccb..70d24ac75c4f850374fa8cffb881652f97e97d22 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
index c0ea0eb0505d20e70d641f2a646a060d7dbfabda..55b16564b30919fb48d97862e2e8cfc0fdda8de5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index ca37ae51314516ae67c7725eb2ccd3d25154e2ac..a230b74c383eff93443d07e48f6818352302d9e7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 3ede2378347f5eddb0e8fae775a0200ea484d3f8..d98f7c39f546048b1483617b24f004d5c9759d14 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index d87e25a7ba8e7cce615431723b53a0106c2b5279..b2e96a4203758270afb8c225a05a481dbc329a84 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
index e4df7b48ae6b41400375920a48ef8577bb69376e..0c45bbdf171ad9831e51d2b1ba952fab9eb3d0e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
index 6bf7c77743c31b6d74df35d827e9d5bc9a25d303..6423d83418aa40b57afb3d5ff22f4ec605183587 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
index c14be132b7e406c99841576be8d8fa9ab99aa816..6e17081375b7fbdcb000dfcbc0cd48ab072fe6e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
index 72ffbceae01da900778dba1ec14e646aa17b39e5..d01d371da596256873c3799b99c45db01c674ff9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index d3e780c8b22ed580f61ffc3d9b2bad7278391402..d3f5508640e268df484af5adaf66621fa3d92d5b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
index a27980a9d17397e558a4b732e3dc332a0c1e8432..44e1007f5420bbb8feda891901c138cb776c071e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
index 67f991276c6908ff54fd516e84533542a5f60528..8fc3ec33310f531a6ca6948dc20db67543cde69a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index fccea5e8af5ab81e712669ff1b2567d8bde8607e..457d27749504167f2865bc272251bd89b5d3297e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index d20663bdb0bc2eea323d35b1e3d4d27122f50472..54eda8ee2121d4fadd73a33010202850d743cc65 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 889fa0a1b58bbd3babd293b7b1b45915a9ee3ca4..711196554698b79c64e1c67bc875ce33c70563c6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConvolution1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index c850f3fedc814b20f0f95cc3cf4fd5c973446b5b..815e34a48de542feb078d0002dbebbbf4d199e63 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConvolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 526d88ccba60eb25c68432e5baa03fd3a878f718..6614760e5e72556ac61c3788065b3faf2d286800 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 7fddae34472411f49d42b4d65d12034d056ec818..bfcfd71ecddfb618ceb53969e8782f535786009b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -114,10 +109,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -164,11 +155,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
index 5b9b62fc970238e49e6d4849285606d0a7908b23..9c4618c4e91ae288284b35c4a0c6bbbfe604d91d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 769da30999993fad05ae0f7c04e256e6cf01a774..9a0a19d2d52f34e61c273bd4d4a27c46940dd5bf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index fca2e42a1519fcf3a9f0ec996c50b148b2df05fd..446f7122a6a2dc1d2f7377cef00d7b5b9a89cd3a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 36e8de09a967c5940bf8078234f5980a78ec8009..52a0485b5cea2be6f2f4d9b0ee31cb2388adcef7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index a96f16fae99af9c30959d228202055e9aebfaf58..c82e7a192dfca8ca38832a53f9135125b4c34286 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_size"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index e1cbd0e150ed890ae57c1725249d1340fc2cb663..9ccf251a18034371607cfdc6091f2282136feec9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
index f0d35728fb1c42d563ff0598dd84da51a766a764..e080a07799fe1b7ec5f73b4e7bf78053a2c9dd3c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -108,11 +99,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 74efaea6ddb22ec2fe9d41558978c183b0e06671..5fadca0b8386951976e8a6330bacf0dcf169e2d1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index dc5bd5fd5319f9bbd601a3c4083ae566b47e1aaa..2d395bf7e87b8835bcf63d792d68dc3ac4083051 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index e01ccfb74aead591f1018cdcbb1c888767ecdb20..18d58ec3b23f7d4fd13ff45f4c1d4d95e7722ed5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
index 7e6f90f7623677244865ac285c134dc79f7b9b69..6223cb2f3c1230a60f3cf3dd57a0e803cf4f15d9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 4d0d402dad442ccf52267f5ce40b05400afbfbc7..e71bba6a7f1df713cd13b4a0249d001d56bb31b0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index b353a529bcf8e543d334fee57fca26ebc83036a4..aba6d8cb1f43bb070f9b17b5290afc5ce30246b5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 9fe1256e616dbca4f35101df160dc55bc68bfa8a..ce545ecc954e84e702fbd24047370a3417dc0fb1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index 8ccf15f9ab0fcfa59907ff05a962a84d3d86ccb4..3ac285681f596194254079359c9405ca47e6a3df 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -74,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -128,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index be12b0bd2ec509ff394eaa3f43db0b54badd7fba..51ba0c5043f63bb59d73f979f832d071273d4f82 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -75,10 +70,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -133,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
index 1c4f550d7f05b8be33326cb39d7a5f3bf663f5e6..38fd78a5a828c7d0da98c97fdc01f504397c6fe6 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
index d2db0952693f2989e6a9e8748a254eb4db483206..86a524cc91e10616cd049cec93843e419ec670c7 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
index 34d9a9df281c09a2e2030daf74a2ceb8066085bb..8a811fe4561ac3790e43a5553ca04fca002e420d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
index 21ad0efecf88c42a3a679910ddfe095585a7933a..3923e706be7a352d770bf309aecd1fadb2a05e81 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
index ed38747c7671a267bb640ecb96a4c5fcc46c5edf..7a0a8a2a51295d9009f44e2ea126e8a4694147af 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
index ff453c6059477c20528fc768d93c65d208cdfc4a..7ed3a652519a6429ff429925c29f3c296a6d2958 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
index 5583bd22dce18b0a0593b73bde509818b63b3f29..23831aa74f1c3dea99e6da407e5a63693f94e37a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
index 63f0c32a7c8f7e530c76c64fa619102bc12f9ad9..9d41a6b09900d984706accd70a353cc26585d9b5 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
index b77726252ccca30a7c6555fb569eb65b69e34998..865fe08e63c81222395d125938c8830c02030733 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
index 92db9f6dcd2f77c4253eb77df4a26fb632b2a766..ee164aae204d3f6c09af79f7fbac825ce470098d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
index 80fa846a24c9162d8521bdb4f098b9cd8e34aedb..8167dc79cdf9f83b9b97557638bf0702a1be86da 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
index f63213b3dde40aa54b165c1c269c26fd2cd9e3b4..efa4419692993aa9975c1af2e647288ae9f38eba 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
index 7c1d05cd2bfc8be978e82566e7a3086040978b4a..2ff89f0a6faef905bcafdcb36121f506e9a9977a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
index 4e45b2d513bb72bb47433d72c310d6a34fbc0c01..b3a6dfdffa28d3628e09e6aab823534ba84edf16 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -23,6 +24,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -51,6 +56,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -81,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -93,7 +102,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -107,6 +120,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -115,10 +132,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -131,6 +160,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -139,4 +172,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
index 19ec33fce775caa634e71e2295ac945a6f70ade9..cef396489dde24698cd9a63b6247292958cfec4e 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
index 76180c333a21c592a3b53bb445df9b12d3596552..565f0c7a79661f77d5987d671266ff69268b03b2 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
index ded75c8ff09efc6746ddd2284f53d2c021cc473c..595ce2eeadfc95fe44895ffd976024aee80ee948 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
index 3dbfa5453f8e0ebb02429df9c4cbdf98de6b8ced..ccca96f72248e390ca65db061d836ee58c8e3205 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
index ab171df1d1650e19836018f3316e6919f6d36def..1c99c9618254cadb1b4e95c7223ca9361e4fa861 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 9c71a24d0500e2091e0ae94cc4dd7ed6b788a54f..f909cd875698bf65b1b005069c4d59f891b0cece 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 9e19f96b7452616956fb7fd3ca62d8f4b25a2122..173d2eae63656ac86d11e9eb051b43489a00560f 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 7540aa62861895a7c41840476d4edb79785a77a9..3c3e38229738fec3b25f437a73f3a9d216d970af 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index fc1ff386690f9c7acb11d4cc0770e394f78350ad..db16660f1145b55c824c698653094977dd6c718b 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -107,7 +116,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -121,6 +134,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -129,10 +146,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -145,6 +174,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -153,6 +186,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 751122cfff3bf9c55dd9fa264fdf2e1960940724..d7f658aaee153652b93eef812197322a984b6d44 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 4b6313f395fd8fd4ec2af78365117620263e7a55..b9ab487c77ccfd8a668e891a2dba0770f1f91ea8 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 00e8c71140596ecea237ce05a09feff1fbb49001..b9e3d934759accd885036fa4c5a7013ef64736f3 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 3852f90dd6c4a254e20e789bdeb7796d61cef6bc..75b5898c591cbe2b761c0f709159c5489cb8f76a 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.nn.rnn_cell.RNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +25,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +57,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -90,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -102,7 +111,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -116,6 +129,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -124,10 +141,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,6 +169,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -148,6 +181,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 8f3f0f7506ef49014b31cd4bc04f1cb1e0d696fc..fee0dc63b997f328a4e3d44040c4056de4128eb7 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index afa3b78eb7fb3618edce06bbff288c37fdf71015..c66249999f6089afdfd9c1e50c4c948797d6a435 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -912,9 +912,13 @@ tf_module {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_compressed"
+    argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "decode_csv"
-    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\'], "
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
   }
   member_method {
     name: "decode_json_example"
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
index dcf747971b7b8bf243502b2388da635705b8ee3e..6b65b0ace3cf7740ab03390841c941592000d127 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_event"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
index 62b956c5ef7dc54e92431f25ec948e341c0e1f24..38cc98b48e78aa93f7614a9baff236f7b119f99d 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
@@ -38,6 +38,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\', \'copy_from_scaffold\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "default_local_init_op"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "finalize"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 0dc154b6d2c6884b3cd91a4f4c7c08825c123124..724b12cd4799eb76fe602c737c850e96e92faa58 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -23,7 +23,6 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow:experimental_tensorflow_py",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:lib",
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 7eeae05847526ca106b80e55c92d3b55f988b149..1ad6b6d1c0ae5ca1ac1329fd49e972840020e4c3 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -34,7 +34,6 @@ import sys
 import unittest
 
 import tensorflow as tf
-from tensorflow import experimental_api as api
 
 from google.protobuf import text_format
 
@@ -47,8 +46,6 @@ from tensorflow.tools.api.lib import python_object_to_proto_visitor
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
 
-if hasattr(tf, 'experimental_api'):
-  del tf.experimental_api
 
 # FLAGS defined at the bottom:
 FLAGS = None
@@ -145,9 +142,6 @@ class ApiCompatibilityTest(test.TestCase):
       verbose_diff_message = ''
       # First check if the key is not found in one or the other.
       if key in only_in_expected:
-        # TODO(annarev): remove once we switch to using tf_export decorators.
-        if key == 'tensorflow.math':
-          continue
         diff_message = 'Object %s expected but not found (removed). %s' % (
             key, additional_missing_object_message)
         verbose_diff_message = diff_message
@@ -232,13 +226,6 @@ class ApiCompatibilityTest(test.TestCase):
         for filename in golden_file_list
     }
 
-    # TODO(annarev): remove once we switch to using tf_export decorators.
-    tf_module = golden_proto_dict['tensorflow'].tf_module
-    for i in range(len(tf_module.member)):
-      if tf_module.member[i].name == 'math':
-        del tf_module.member[i]
-        break
-
     # Diff them. Do not fail if called with update.
     # If the test is run to update goldens, only report diffs but do not fail.
     self._AssertProtoDictEquals(
@@ -247,49 +234,6 @@ class ApiCompatibilityTest(test.TestCase):
         verbose=FLAGS.verbose_diffs,
         update_goldens=FLAGS.update_goldens)
 
-  @unittest.skipUnless(
-      sys.version_info.major == 2,
-      'API compabitility test goldens are generated using python2.')
-  def testNewAPIBackwardsCompatibility(self):
-    # Extract all API stuff.
-    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
-
-    public_api_visitor = public_api.PublicAPIVisitor(visitor)
-    public_api_visitor.do_not_descend_map['tf'].append('contrib')
-    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
-    # TODO(annarev): Make slide_dataset available in API.
-    public_api_visitor.private_map['tf'] = ['slide_dataset']
-    traverse.traverse(api, public_api_visitor)
-
-    proto_dict = visitor.GetProtos()
-
-    # Read all golden files.
-    expression = os.path.join(
-        resource_loader.get_root_dir_with_all_resources(),
-        _KeyToFilePath('*'))
-    golden_file_list = file_io.get_matching_files(expression)
-
-    def _ReadFileToProto(filename):
-      """Read a filename, create a protobuf from its contents."""
-      ret_val = api_objects_pb2.TFAPIObject()
-      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
-      return ret_val
-
-    golden_proto_dict = {
-        _FileNameToKey(filename): _ReadFileToProto(filename)
-        for filename in golden_file_list
-    }
-
-    # Diff them. Do not fail if called with update.
-    # If the test is run to update goldens, only report diffs but do not fail.
-    self._AssertProtoDictEquals(
-        golden_proto_dict,
-        proto_dict,
-        verbose=FLAGS.verbose_diffs,
-        update_goldens=False,
-        additional_missing_object_message=
-        'Check if tf_export decorator/call is missing for this symbol.')
-
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 3c3b223a0044b7136ea4dee20fa72cd2fed3742a..30554a084c5689768665557d593b928fbd98d8cb 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -28,6 +28,9 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -37,9 +40,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index 97829892b10059f9d9663e103534891d1481abec..3b437d3c58c384c389820e57ae6bcc57c6c13efb 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -31,6 +31,9 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog
 :: Set ctest binary location.
 IF DEFINED CTEST_EXE (ECHO CTEST_EXE is set to %CTEST_EXE%) ELSE (SET CTEST_EXE="C:\Program Files\cmake\bin\ctest.exe")
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -40,9 +43,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 024cb40eb4b9380fa09bd0e371826783d1ebdc45..78cb4d250e84a4a165dd42db6845170c1751ffbe 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 11f476d12c086f70335d9a69d7f3b86b525b5623..390d7442c37b1d10f945d8657f5034403e3e2b96 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -38,6 +38,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
@@ -70,7 +72,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 037d13116efc5ddf76c31eb87d7f81d31c3591f5..c65e0b72bc582d39b75ad042e4c673aa603639be 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.7
+ARG TF_BRANCH=r1.8
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 1fcb6428b21b4ca495bef2b3249b6463e9ef0a10..293028d229adba2380d6a91f409d11208c8d875c 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -47,6 +47,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
@@ -79,7 +81,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 625321e1235202f78a2d5e1a5b2d9d05e1e3f9ba..9e1708662e79746e54af4409756774a306990438 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -54,7 +54,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index b4fba5b8f5e19c2fbb8c7261d8cf293757df503c..05de25f2cb11d76f223a31bc12329e6ab7368e8a 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -284,7 +284,7 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
     if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile for python version "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
@@ -306,7 +306,7 @@ else
         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
     else
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 6185c9ddd7b2c044728daa6f14befee3c8add036..111d54d8205f805cc24d21c610acc81610b8d47d 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -311,6 +311,10 @@ def build_doc_index(src_dir):
         continue
       title_parser = _GetMarkdownTitle()
       title_parser.process(os.path.join(dirpath, base_name))
+      if title_parser.title is None:
+        msg = ('`{}` has no markdown title (# title)'.format(
+            os.path.join(dirpath, base_name)))
+        raise ValueError(msg)
       key_parts = os.path.join(suffix, base_name[:-3]).split('/')
       if key_parts[-1] == 'index':
         key_parts = key_parts[:-1]
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index cbcdbf5b807a585865e2e3f19291e55388d55cb1..2151a75e840103d7dcb5dd5015969c840cc47311 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -139,7 +139,7 @@ def configure(src_base_path, gen_path, debug=False):
     print("gen_git_source.py: spec is %r" % spec)
 
 
-def get_git_version(git_base_path):
+def get_git_version(git_base_path, git_tag_override):
   """Get the git version from the repository.
 
   This function runs `git describe ...` in the path given as `git_base_path`.
@@ -152,6 +152,9 @@ def get_git_version(git_base_path):
 
   Args:
     git_base_path: where the .git directory is located
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   Returns:
     A bytestring representing the git version
   """
@@ -161,6 +164,14 @@ def get_git_version(git_base_path):
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
+    if git_tag_override:
+      split_val = val.split("-")
+      if len(split_val) != 3:
+        raise Exception(
+            ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
+             "but got '%s'") % val)
+      split_val[0] = git_tag_override
+      val = bytes("-".join(split_val))
     return val if val else unknown_label
   except subprocess.CalledProcessError:
     return unknown_label
@@ -178,7 +189,15 @@ def write_version_info(filename, git_version):
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
@@ -197,7 +216,7 @@ const int tf_monolithic_build() {
   open(filename, "w").write(contents)
 
 
-def generate(arglist):
+def generate(arglist, git_tag_override=None):
   """Generate version_info.cc as given `destination_file`.
 
   Args:
@@ -217,6 +236,10 @@ def generate(arglist):
   `ref_symlink` is unused in this script but passed, because the build
     system uses that file to detect when commits happen.
 
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
+
   Raises:
     RuntimeError: If ./configure needs to be run, RuntimeError will be raised.
   """
@@ -234,11 +257,11 @@ def generate(arglist):
       raise RuntimeError(
           "Run ./configure again, branch was '%s' but is now '%s'" %
           (old_branch, new_branch))
-    git_version = get_git_version(data["path"])
+    git_version = get_git_version(data["path"], git_tag_override)
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file):
+def raw_generate(output_file, source_dir, git_tag_override=None):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -246,9 +269,13 @@ def raw_generate(output_file):
 
   Args:
     output_file: Output filename for the version info cc
+    source_dir: Base path of the source code
+    git_tag_override: Override the value for the git tag. This is useful for
+      releases where we want to build the release before the git tag is
+      created.
   """
 
-  git_version = get_git_version(".")
+  git_version = get_git_version(source_dir, git_tag_override)
   write_version_info(output_file, git_version)
 
 
@@ -270,6 +297,11 @@ parser.add_argument(
     "--gen_root_path", type=str,
     help="Root path to place generated git files (created by --configure).")
 
+parser.add_argument(
+    "--git_tag_override", type=str,
+    help="Override git tag value in the __git_version__ string. Useful when "
+         "creating release builds before the release tag is created.")
+
 parser.add_argument(
     "--generate",
     type=str,
@@ -281,6 +313,11 @@ parser.add_argument(
     type=str,
     help="Generate version_info.cc (simpler version used for cmake/make)")
 
+parser.add_argument(
+    "--source_dir",
+    type=str,
+    help="Base path of the source code (used for cmake/make)")
+
 args = parser.parse_args()
 
 if args.configure is not None:
@@ -288,9 +325,12 @@ if args.configure is not None:
     raise RuntimeError("Must pass --gen_root_path arg when running --configure")
   configure(args.configure, args.gen_root_path, debug=args.debug)
 elif args.generate is not None:
-  generate(args.generate)
+  generate(args.generate, args.git_tag_override)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate)
+  source_path = "."
+  if args.source_dir is not None:
+    source_path = args.source_dir
+  raw_generate(args.raw_generate, source_path, args.git_tag_override)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index db20bb00e84b47bd15244e70b925f59e62731deb..cd128af6b36f2f99e5cf91961476d30384227e9b 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -28,7 +28,15 @@ fi
 cat <<EOF > ${OUTPUT_FILENAME}
 #include <string>
 const char* tf_git_version() {return "${GIT_VERSION}";}
-const char* tf_compiler_version() {return __VERSION__;}
+const char* tf_compiler_version() {
+#ifdef _MSC_VER
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  return "MSVC " TOSTRING(_MSC_FULL_VER);
+#else
+  return __VERSION__;
+#endif
+}
 const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc
index 28387c2b48c06ecffd2afa0705a8dea5bc368460..8ce8f5e24b9f002e50d456c8ccab8a6414fca724 100644
--- a/tensorflow/tools/graph_transforms/transform_graph.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph.cc
@@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/file_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
+#if !defined(PLATFORM_WINDOWS)
+#include <pwd.h>
+#endif
 
 namespace tensorflow {
 namespace graph_transforms {
@@ -130,16 +133,64 @@ Status ParseTransformParameters(const string& transforms_string,
   return Status::OK();
 }
 
+std::string ExpandPath(const std::string& path_string) {
+#if defined(PLATFORM_WINDOWS)
+  return path_string;
+#else
+  if (path_string.empty() || path_string[0] != '~') {
+    return path_string;
+  }
+
+  const char* home = NULL;
+  std::string::size_type prefix = path_string.find_first_of('/');
+  if (path_string.length() == 1 || prefix == 1) {
+    // The value of $HOME, e.g., ~/foo
+    home = getenv("HOME");
+    if (!home) {
+      // If HOME is not available, get uid
+      struct passwd* pw = getpwuid(getuid());
+      if (pw) {
+        home = pw->pw_dir;
+      }
+    }
+  } else {
+    // The value of ~user, e.g., ~user/foo
+    std::string user(path_string, 1, (prefix == std::string::npos)
+                                         ? std::string::npos
+                                         : prefix - 1);
+    struct passwd* pw = getpwnam(user.c_str());
+    if (pw) {
+      home = pw->pw_dir;
+    }
+  }
+
+  if (!home) {
+    return path_string;
+  }
+
+  string path(home);
+  if (prefix == std::string::npos) {
+    return path;
+  }
+
+  if (path.length() == 0 || path[path.length() - 1] != '/') {
+    path += '/';
+  }
+  path += path_string.substr(prefix + 1);
+  return path;
+#endif
+}
+
 int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
-  string in_graph = "";
-  string out_graph = "";
+  string in_graph_string = "";
+  string out_graph_string = "";
   string inputs_string = "";
   string outputs_string = "";
   string transforms_string = "";
   bool output_as_text = false;
   std::vector<Flag> flag_list = {
-      Flag("in_graph", &in_graph, "input graph file name"),
-      Flag("out_graph", &out_graph, "output graph file name"),
+      Flag("in_graph", &in_graph_string, "input graph file name"),
+      Flag("out_graph", &out_graph_string, "output graph file name"),
       Flag("inputs", &inputs_string, "inputs"),
       Flag("outputs", &outputs_string, "outputs"),
       Flag("transforms", &transforms_string, "list of transforms"),
@@ -166,11 +217,11 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     LOG(ERROR) << "Unknown argument " << argv[1] << ".\n" << usage;
     return -1;
   }
-  if (in_graph.empty()) {
+  if (in_graph_string.empty()) {
     LOG(ERROR) << "in_graph graph can't be empty.\n" << usage;
     return -1;
   }
-  if (out_graph.empty()) {
+  if (out_graph_string.empty()) {
     LOG(ERROR) << "out_graph graph can't be empty.\n" << usage;
     return -1;
   }
@@ -179,6 +230,9 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     return -1;
   }
 
+  string in_graph = ExpandPath(in_graph_string);
+  string out_graph = ExpandPath(out_graph_string);
+
   std::vector<string> inputs = str_util::Split(inputs_string, ',');
   std::vector<string> outputs = str_util::Split(outputs_string, ',');
   TransformParameters transform_params;
@@ -197,7 +251,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
   GraphDef graph_def;
   Status load_status = LoadTextOrBinaryGraphFile(in_graph, &graph_def);
   if (!load_status.ok()) {
-    LOG(ERROR) << "Loading graph '" << in_graph << "' failed with "
+    LOG(ERROR) << "Loading graph '" << in_graph_string << "' failed with "
                << load_status.error_message();
     LOG(ERROR) << usage;
     return -1;
@@ -219,7 +273,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) {
     save_status = WriteBinaryProto(Env::Default(), out_graph, graph_def);
   }
   if (!save_status.ok()) {
-    LOG(ERROR) << "Saving graph '" << out_graph << "' failed with "
+    LOG(ERROR) << "Saving graph '" << out_graph_string << "' failed with "
                << save_status.error_message();
     return -1;
   }
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index ed941c3bc23c3b0e5e21fcbe03068f174b5887c6..f676f040ad3ebc7f377c7a4c224c5dba95f2f108 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,13 +29,12 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.7.0'
+_VERSION = '1.8.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'grpcio >= 1.8.6',
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
@@ -43,6 +42,12 @@ REQUIRED_PACKAGES = [
     'termcolor >= 1.1.0',
 ]
 
+if sys.byteorder == 'little':
+  # grpcio does not build correctly on big-endian machines due to lack of
+  # BoringSSL support.
+  # See https://github.com/tensorflow/tensorflow/issues/17882.
+  REQUIRED_PACKAGES.append('grpcio >= 1.8.6')
+
 project_name = 'tensorflow'
 if '--project_name' in sys.argv:
   project_name_idx = sys.argv.index('--project_name')
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 018a3950636f4a4d62de922b93c533ef66685b75..f775491e4a29997f2672e9dd9aa413ca98a3f484 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -210,11 +210,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "jpeg",
       urls = [
-          "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
-          "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
+          "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.3.tar.gz",
+          "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.3.tar.gz",
       ],
-      sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
-      strip_prefix = "libjpeg-turbo-1.5.1",
+      sha256 = "1a17020f859cb12711175a67eab5c71fc1904e04b587046218e36106e07eabde",
+      strip_prefix = "libjpeg-turbo-1.5.3",
       build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
   )
 
@@ -232,11 +232,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "org_sqlite",
       urls = [
-          "https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
-          "http://www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
+          "https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3230100.zip",
+          "https://www.sqlite.org/2018/sqlite-amalgamation-3230100.zip",
       ],
-      sha256 = "208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4",
-      strip_prefix = "sqlite-amalgamation-3200000",
+      sha256 = "4239a1f69e5721d07d9a374eb84d594225229e54be4ee628da2995f4315d8dfc",
+      strip_prefix = "sqlite-amalgamation-3230100",
       build_file = clean_dep("//third_party:sqlite.BUILD"),
   )
 
@@ -427,11 +427,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2.tar.gz",
-          "https://github.com/grpc/grpc/archive/bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/09386db3939cae1ac12e5f09b735adfa8958c68e.tar.gz",
+          "https://github.com/grpc/grpc/archive/09386db3939cae1ac12e5f09b735adfa8958c68e.tar.gz",
       ],
-      sha256 = "0a05bd355e4571b01d813dddffa38e57e689ac41b264dc9b1bd6ec66463ef5d6",
-      strip_prefix = "grpc-bd6bdf93279a39a8cd92978fd7c9d14eccd98fc2",
+      sha256 = "b857969c667c14f37faa507afc07a3f39a47fbf73203be889d55925622e7b317",
+      strip_prefix = "grpc-09386db3939cae1ac12e5f09b735adfa8958c68e",
   )
 
 
@@ -451,11 +451,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/15535accd9e1e9d7772202ce51c8428c1994a04b.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/15535accd9e1e9d7772202ce51c8428c1994a04b.tar.gz",
       ],
-      sha256 = "a6d94bd9de23515a1e3792a830421e3885977ea43d03427cdbe68f98cb7e0045",
-      strip_prefix = "llvm-7e78daafdd22f3f17720a103d29d89590534004e",
+      sha256 = "3470c2dde055dc974e859e707aa6cd1d22eadd4f3a1f282e74c3cf1f7dc9510a",
+      strip_prefix = "llvm-15535accd9e1e9d7772202ce51c8428c1994a04b",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 075b46896ed868d8e2e1bcddf6d867974a248313..097bbf5d4212986a2268f87de81457568f88c242 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -2053,6 +2053,7 @@ cc_library(
         "include/llvm/Target/*.def",
         "include/llvm/Target/*.inc",
         "include/llvm/CodeGen/*.def",
+        "include/llvm/CodeGen/*.inc",
     ]),
     deps = [
         ":analysis",